2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/kernel/sched/fair.c
@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
@@ -1764,42 +1883,95 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
@@ -1807,7 +1979,8 @@
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
18131986
@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,10 +3929,10 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
....@@ -3765,13 +3961,29 @@
37653961
37663962 /* Update root cfs_rq's estimated utilization */
37673963 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3964
+ enqueued += _task_util_est(p);
37693965 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703966
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3967
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743968 }
3969
+
3970
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3971
+ struct task_struct *p)
3972
+{
3973
+ unsigned int enqueued;
3974
+
3975
+ if (!sched_feat(UTIL_EST))
3976
+ return;
3977
+
3978
+ /* Update root cfs_rq's estimated utilization */
3979
+ enqueued = cfs_rq->avg.util_est.enqueued;
3980
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3981
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3982
+
3983
+ trace_sched_util_est_cfs_tp(cfs_rq);
3984
+}
3985
+
3986
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753987
37763988 /*
37773989 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +3998,20 @@
37863998 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37873999 }
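within_margin() turns a two-sided range check into a single unsigned compare: adding (margin - 1) shifts the open interval (-margin, margin) onto [0, 2*margin - 2], and anything outside wraps around to a huge unsigned value that fails the < test. A quick self-check of the boundaries, with arbitrary numbers:

#include <assert.h>

/* same trick as within_margin(): true iff -margin < value < margin */
static int within_margin(long value, unsigned int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	unsigned int m = 1024 / 100;	/* ~1% of SCHED_CAPACITY_SCALE */

	assert( within_margin(0, m));
	assert( within_margin((long)m - 1, m));		/* just inside   */
	assert(!within_margin((long)m, m));		/* edge excluded */
	assert( within_margin(-(long)m + 1, m));
	assert(!within_margin(-(long)m, m));
	return 0;
}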
37884000
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4001
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4002
+ struct task_struct *p,
4003
+ bool task_sleep)
37914004 {
3792
- long last_ewma_diff;
4005
+ long last_ewma_diff, last_enqueued_diff;
37934006 struct util_est ue;
3794
- int cpu;
4007
+ int ret = 0;
4008
+
4009
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4010
+ if (ret)
4011
+ return;
37954012
37964013 if (!sched_feat(UTIL_EST))
37974014 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074015
38084016 /*
38094017 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4028,13 @@
38204028 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214029 return;
38224030
4031
+ last_enqueued_diff = ue.enqueued;
4032
+
38234033 /*
38244034 * Reset EWMA on utilization increases, the moving average is used only
38254035 * to smooth utilization decreases.
38264036 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4037
+ ue.enqueued = task_util(p);
38284038 if (sched_feat(UTIL_EST_FASTUP)) {
38294039 if (ue.ewma < ue.enqueued) {
38304040 ue.ewma = ue.enqueued;
....@@ -3833,19 +4043,23 @@
38334043 }
38344044
38354045 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4046
+ * Skip update of task's estimated utilization when its members are
38374047 * already ~1% close to its last activation value.
38384048 */
38394049 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4050
+ last_enqueued_diff -= ue.enqueued;
4051
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4052
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4053
+ goto done;
4054
+
38414055 return;
4056
+ }
38424057
38434058 /*
38444059 * To avoid overestimation of actual task utilization, skip updates if
38454060 * we cannot guarantee there is idle time on this CPU.
38464061 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4062
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494063 return;
38504064
38514065 /*
....@@ -3869,39 +4083,26 @@
38694083 ue.ewma += last_ewma_diff;
38704084 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714085 done:
4086
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724087 WRITE_ONCE(p->se.avg.util_est, ue);
38734088
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4089
+ trace_sched_util_est_se_tp(&p->se);
38764090 }
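The EWMA step at the end of util_est_update() gives the new sample one quarter of the weight (assuming the kernel's UTIL_EST_WEIGHT_SHIFT of 2): ewma' = ewma + (enqueued - ewma) / 4, computed with shifts only. A standalone sketch of that arithmetic, including the UTIL_EST_FASTUP reset on utilization increases:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2	/* new sample gets 1/4 weight */

/* mirrors the ewma half of util_est_update(); resets on increases (FASTUP) */
static unsigned int ewma_update(unsigned int ewma, unsigned int enqueued)
{
	long diff;

	if (enqueued >= ewma)
		return enqueued;		/* UTIL_EST_FASTUP behaviour */

	diff = (long)enqueued - (long)ewma;	/* negative here */
	ewma <<= UTIL_EST_WEIGHT_SHIFT;		/* ewma * 4      */
	ewma += diff;				/* + (new - old) */
	ewma >>= UTIL_EST_WEIGHT_SHIFT;		/* / 4           */
	return ewma;
}

int main(void)
{
	unsigned int ewma = 400;

	/* utilization dropped to 100: the estimate decays by quarters */
	for (int i = 0; i < 4; i++) {
		ewma = ewma_update(ewma, 100);
		printf("step %d: ewma=%u\n", i, ewma);
	}
	return 0;
}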
38774091
38784092 static inline int task_fits_capacity(struct task_struct *p, long capacity)
38794093 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
4094
+ return fits_capacity(uclamp_task_util(p), capacity);
38814095 }
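fits_capacity() enforces roughly 20% headroom: a utilization fits a CPU only while util * 1280 < capacity * 1024, i.e. util stays below 80% of the capacity. A minimal check of where the cut-off lands, assuming the same 1280/1024 constants used by the macro elsewhere in this file:

#include <stdio.h>

/* same ~20% margin as the kernel's fits_capacity() macro */
#define fits_capacity(cap, max)	((cap) * 1280UL < (max) * 1024UL)

int main(void)
{
	unsigned long capacity = 446;	/* e.g. a little CPU */

	/* the largest utilization that still fits is just under 80% of capacity */
	for (unsigned long util = 350; util <= 360; util++) {
		if (!fits_capacity(util, capacity)) {
			printf("first misfit at util=%lu (80%% of %lu is ~%lu)\n",
			       util, capacity, capacity * 4 / 5);
			break;
		}
	}
	return 0;
}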
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
3886
- unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
3888
-
3889
- if (capacity == max_capacity)
3890
- return true;
3891
-
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
3894
-
3895
- return task_fits_capacity(p, capacity);
3896
-}
3897
-#endif
38984096
38994097 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004098 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4099
+ bool need_update = true;
4100
+
4101
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4102
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024103 return;
39034104
3904
- if (!p) {
4105
+ if (!p || p->nr_cpus_allowed == 1) {
39054106 rq->misfit_task_load = 0;
39064107 return;
39074108 }
....@@ -3911,7 +4112,11 @@
39114112 return;
39124113 }
39134114
3914
- rq->misfit_task_load = task_h_load(p);
4115
+ /*
4116
+ * Make sure that misfit_task_load will not be null even if
4117
+ * task_h_load() returns 0.
4118
+ */
4119
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154120 }
39164121
39174122 #else /* CONFIG_SMP */
....@@ -3928,11 +4133,11 @@
39284133 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294134
39304135 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4136
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324137 static inline void
39334138 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344139
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4140
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364141 {
39374142 return 0;
39384143 }
....@@ -3941,8 +4146,11 @@
39414146 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424147
39434148 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4149
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4150
+
4151
+static inline void
4152
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4153
+ bool task_sleep) {}
39464154 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474155
39484156 #endif /* CONFIG_SMP */
....@@ -3990,6 +4198,7 @@
39904198
39914199 /* ensure we never gain time by being placed backwards. */
39924200 se->vruntime = max_vruntime(se->vruntime, vruntime);
4201
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934202 }
39944203
39954204 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4223,7 @@
40144223 #endif
40154224 }
40164225
4226
+static inline bool cfs_bandwidth_used(void);
40174227
40184228 /*
40194229 * MIGRATION
....@@ -4078,8 +4288,8 @@
40784288 * - Add its new weight to cfs_rq->load.weight
40794289 */
40804290 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4291
+ se_update_runnable(se);
40814292 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834293 account_entity_enqueue(cfs_rq, se);
40844294
40854295 if (flags & ENQUEUE_WAKEUP)
....@@ -4092,10 +4302,16 @@
40924302 __enqueue_entity(cfs_rq, se);
40934303 se->on_rq = 1;
40944304
4095
- if (cfs_rq->nr_running == 1) {
4305
+ /*
4306
+ * When bandwidth control is enabled, cfs might have been removed
4307
+ * because of a parent being throttled but cfs->nr_running > 1. Try to
4308
+ * add it unconditionally.
4309
+ */
4310
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964311 list_add_leaf_cfs_rq(cfs_rq);
4312
+
4313
+ if (cfs_rq->nr_running == 1)
40974314 check_enqueue_throttle(cfs_rq);
4098
- }
40994315 }
41004316
41014317 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4372,13 @@
41564372 /*
41574373 * When dequeuing a sched_entity, we must:
41584374 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4375
+ * - Subtract its load from the cfs_rq->runnable_avg.
4376
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614377 * - For group entity, update its weight to reflect the new share
41624378 * of its group cfs_rq.
41634379 */
41644380 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4381
+ se_update_runnable(se);
41664382
41674383 update_stats_dequeue(cfs_rq, se, flags);
41684384
....@@ -4206,9 +4422,14 @@
42064422 unsigned long ideal_runtime, delta_exec;
42074423 struct sched_entity *se;
42084424 s64 delta;
4425
+ bool skip_preempt = false;
42094426
42104427 ideal_runtime = sched_slice(cfs_rq, curr);
42114428 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4429
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4430
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4431
+ if (skip_preempt)
4432
+ return;
42124433 if (delta_exec > ideal_runtime) {
42134434 resched_curr_lazy(rq_of(cfs_rq));
42144435 /*
....@@ -4237,8 +4458,7 @@
42374458 resched_curr_lazy(rq_of(cfs_rq));
42384459 }
42394460
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4461
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424462 {
42434463 /* 'current' is not kept within the tree. */
42444464 if (se->on_rq) {
....@@ -4260,7 +4480,8 @@
42604480 * least twice that of our own weight (i.e. dont track it
42614481 * when there are only lesser-weight tasks around):
42624482 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4483
+ if (schedstat_enabled() &&
4484
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644485 schedstat_set(se->statistics.slice_max,
42654486 max((u64)schedstat_val(se->statistics.slice_max),
42664487 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4489,8 @@
42684489
42694490 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704491 }
4492
+EXPORT_SYMBOL_GPL(set_next_entity);
4493
+
42714494
42724495 static int
42734496 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4506,11 @@
42834506 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844507 {
42854508 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4509
+ struct sched_entity *se = NULL;
4510
+
4511
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4512
+ if (se)
4513
+ goto done;
42874514
42884515 /*
42894516 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4540,19 @@
43134540 se = second;
43144541 }
43154542
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4543
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4544
+ /*
4545
+ * Someone really wants this to run. If it's not unfair, run it.
4546
+ */
43264547 se = cfs_rq->next;
4548
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4549
+ /*
4550
+ * Prefer last buddy, try to return the CPU to a preempted task.
4551
+ */
4552
+ se = cfs_rq->last;
4553
+ }
43274554
4555
+done:
43284556 clear_buddies(cfs_rq, se);
43294557
43304558 return se;
....@@ -4457,26 +4685,17 @@
44574685 return &tg->cfs_bandwidth;
44584686 }
44594687
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694688 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4689
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4690
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714691 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4692
+ u64 min_amount, amount = 0;
4693
+
4694
+ lockdep_assert_held(&cfs_b->lock);
44754695
44764696 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4697
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784698
4479
- raw_spin_lock(&cfs_b->lock);
44804699 if (cfs_b->quota == RUNTIME_INF)
44814700 amount = min_amount;
44824701 else {
....@@ -4488,11 +4707,23 @@
44884707 cfs_b->idle = 0;
44894708 }
44904709 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924710
44934711 cfs_rq->runtime_remaining += amount;
44944712
44954713 return cfs_rq->runtime_remaining > 0;
4714
+}
4715
+
4716
+/* returns 0 on failure to allocate runtime */
4717
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4718
+{
4719
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4720
+ int ret;
4721
+
4722
+ raw_spin_lock(&cfs_b->lock);
4723
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4724
+ raw_spin_unlock(&cfs_b->lock);
4725
+
4726
+ return ret;
44964727 }
44974728
44984729 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4557,9 +4788,8 @@
45574788
45584789 cfs_rq->throttle_count--;
45594790 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4791
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4792
+ cfs_rq->throttled_clock_pelt;
45634793
45644794 /* Add cfs_rq with already running entity in the list */
45654795 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4806,7 @@
45764806
45774807 /* group is entering throttled state, stop time */
45784808 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4809
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804810 list_del_leaf_cfs_rq(cfs_rq);
45814811 }
45824812 cfs_rq->throttle_count++;
....@@ -4584,13 +4814,33 @@
45844814 return 0;
45854815 }
45864816
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4817
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884818 {
45894819 struct rq *rq = rq_of(cfs_rq);
45904820 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914821 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4822
+ long task_delta, idle_task_delta, dequeue = 1;
4823
+
4824
+ raw_spin_lock(&cfs_b->lock);
4825
+ /* This will start the period timer if necessary */
4826
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4827
+ /*
4828
+ * We have raced with bandwidth becoming available, and if we
4829
+ * actually throttled the timer might not unthrottle us for an
4830
+ * entire period. We additionally needed to make sure that any
4831
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
4832
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
4833
+ * for 1ns of runtime rather than just check cfs_b.
4834
+ */
4835
+ dequeue = 0;
4836
+ } else {
4837
+ list_add_tail_rcu(&cfs_rq->throttled_list,
4838
+ &cfs_b->throttled_cfs_rq);
4839
+ }
4840
+ raw_spin_unlock(&cfs_b->lock);
4841
+
4842
+ if (!dequeue)
4843
+ return false; /* Throttle no longer required. */
45944844
45954845 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45964846
....@@ -4600,15 +4850,22 @@
46004850 rcu_read_unlock();
46014851
46024852 task_delta = cfs_rq->h_nr_running;
4853
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46034854 for_each_sched_entity(se) {
46044855 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46054856 /* throttled entity or throttle-on-deactivate */
46064857 if (!se->on_rq)
46074858 break;
46084859
4609
- if (dequeue)
4860
+ if (dequeue) {
46104861 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4862
+ } else {
4863
+ update_load_avg(qcfs_rq, se, 0);
4864
+ se_update_runnable(se);
4865
+ }
4866
+
46114867 qcfs_rq->h_nr_running -= task_delta;
4868
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46124869
46134870 if (qcfs_rq->load.weight)
46144871 dequeue = 0;
....@@ -4617,29 +4874,13 @@
46174874 if (!se)
46184875 sub_nr_running(rq, task_delta);
46194876
4877
+ /*
4878
+ * Note: distribution will already see us throttled via the
4879
+ * throttled-list. rq->lock protects completion.
4880
+ */
46204881 cfs_rq->throttled = 1;
46214882 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
4883
+ return true;
46434884 }
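The reworked throttle path above closes a race by asking the global pool for a token 1 ns of runtime while holding cfs_b->lock: if even that tiny request succeeds, bandwidth was just replenished and the throttle is abandoned; only on failure is the cfs_rq queued on the throttled list, under the same lock. A simplified pthread-based sketch of that check-and-queue-under-one-lock pattern, with made-up names and none of the kernel's period or timer handling:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct pool {
	pthread_mutex_t lock;
	long long runtime;	/* ns left in the current period */
};

/* grab at least 'want' ns from the pool; loosely modelled on __assign_cfs_rq_runtime() */
static bool assign_runtime(struct pool *p, long long *local, long long want)
{
	bool ok = false;

	pthread_mutex_lock(&p->lock);
	if (p->runtime >= want) {
		p->runtime -= want;
		*local += want;
		ok = true;
	}
	pthread_mutex_unlock(&p->lock);
	return ok;
}

int main(void)
{
	struct pool p = { .lock = PTHREAD_MUTEX_INITIALIZER, .runtime = 5 };
	long long local = 0;

	/* ask for a token 1 ns before committing to throttle */
	if (assign_runtime(&p, &local, 1))
		printf("raced with a refill: do not throttle\n");
	else
		printf("really out of runtime: queue on the throttled list\n");
	return 0;
}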
46444885
46454886 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +4888,7 @@
46474888 struct rq *rq = rq_of(cfs_rq);
46484889 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46494890 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
4891
+ long task_delta, idle_task_delta;
46524892
46534893 se = cfs_rq->tg->se[cpu_of(rq)];
46544894
....@@ -4668,34 +4908,70 @@
46684908 return;
46694909
46704910 task_delta = cfs_rq->h_nr_running;
4911
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46714912 for_each_sched_entity(se) {
46724913 if (se->on_rq)
4673
- enqueue = 0;
4674
-
4914
+ break;
46754915 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
4916
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46794917
4918
+ cfs_rq->h_nr_running += task_delta;
4919
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4920
+
4921
+ /* end evaluation on encountering a throttled cfs_rq */
46804922 if (cfs_rq_throttled(cfs_rq))
4923
+ goto unthrottle_throttle;
4924
+ }
4925
+
4926
+ for_each_sched_entity(se) {
4927
+ cfs_rq = cfs_rq_of(se);
4928
+
4929
+ update_load_avg(cfs_rq, se, UPDATE_TG);
4930
+ se_update_runnable(se);
4931
+
4932
+ cfs_rq->h_nr_running += task_delta;
4933
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4934
+
4935
+
4936
+ /* end evaluation on encountering a throttled cfs_rq */
4937
+ if (cfs_rq_throttled(cfs_rq))
4938
+ goto unthrottle_throttle;
4939
+
4940
+ /*
4941
+ * One parent has been throttled and cfs_rq removed from the
4942
+ * list. Add it back to not break the leaf list.
4943
+ */
4944
+ if (throttled_hierarchy(cfs_rq))
4945
+ list_add_leaf_cfs_rq(cfs_rq);
4946
+ }
4947
+
4948
+ /* At this point se is NULL and we are at root level*/
4949
+ add_nr_running(rq, task_delta);
4950
+
4951
+unthrottle_throttle:
4952
+ /*
4953
+ * The cfs_rq_throttled() breaks in the above iteration can result in
4954
+ * incomplete leaf list maintenance, resulting in triggering the
4955
+ * assertion below.
4956
+ */
4957
+ for_each_sched_entity(se) {
4958
+ cfs_rq = cfs_rq_of(se);
4959
+
4960
+ if (list_add_leaf_cfs_rq(cfs_rq))
46814961 break;
46824962 }
46834963
46844964 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46884965
46894966 /* Determine whether we need to wake up potentially idle CPU: */
46904967 if (rq->curr == rq->idle && rq->cfs.nr_running)
46914968 resched_curr(rq);
46924969 }
46934970
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4971
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46954972 {
46964973 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
4974
+ u64 runtime, remaining = 1;
46994975
47004976 rcu_read_lock();
47014977 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4710,10 +4986,13 @@
47104986 /* By the above check, this should never be true */
47114987 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47124988
4989
+ raw_spin_lock(&cfs_b->lock);
47134990 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
4991
+ if (runtime > cfs_b->runtime)
4992
+ runtime = cfs_b->runtime;
4993
+ cfs_b->runtime -= runtime;
4994
+ remaining = cfs_b->runtime;
4995
+ raw_spin_unlock(&cfs_b->lock);
47174996
47184997 cfs_rq->runtime_remaining += runtime;
47194998
....@@ -4728,8 +5007,6 @@
47285007 break;
47295008 }
47305009 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335010 }
47345011
47355012 /*
....@@ -4740,7 +5017,6 @@
47405017 */
47415018 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425019 {
4743
- u64 runtime;
47445020 int throttled;
47455021
47465022 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5045,15 @@
47695045 cfs_b->nr_throttled += overrun;
47705046
47715047 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5048
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775049 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
5050
+ while (throttled && cfs_b->runtime > 0) {
47815051 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825052 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5053
+ distribute_cfs_runtime(cfs_b);
47845054 raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855055
4786
- cfs_b->distribute_running = 0;
47875056 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905057 }
47915058
47925059 /*
....@@ -4842,6 +5109,11 @@
48425109 if (runtime_refresh_within(cfs_b, min_left))
48435110 return;
48445111
5112
+ /* don't push forwards an existing deferred unthrottle */
5113
+ if (cfs_b->slack_started)
5114
+ return;
5115
+ cfs_b->slack_started = true;
5116
+
48455117 hrtimer_start(&cfs_b->slack_timer,
48465118 ns_to_ktime(cfs_bandwidth_slack_period),
48475119 HRTIMER_MODE_REL);
....@@ -4893,10 +5165,7 @@
48935165
48945166 /* confirm we're still not at a refresh boundary */
48955167 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4896
- if (cfs_b->distribute_running) {
4897
- raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4898
- return;
4899
- }
5168
+ cfs_b->slack_started = false;
49005169
49015170 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
49025171 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
....@@ -4906,26 +5175,21 @@
49065175 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49075176 runtime = cfs_b->runtime;
49085177
4909
- if (runtime)
4910
- cfs_b->distribute_running = 1;
4911
-
49125178 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49135179
49145180 if (!runtime)
49155181 return;
49165182
4917
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5183
+ distribute_cfs_runtime(cfs_b);
49185184
49195185 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4920
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4921
- cfs_b->distribute_running = 0;
49225186 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49235187 }
49245188
49255189 /*
49265190 * When a group wakes up we want to make sure that its quota is not already
49275191 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4928
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5192
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49295193 */
49305194 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49315195 {
....@@ -4960,7 +5224,7 @@
49605224 pcfs_rq = tg->parent->cfs_rq[cpu];
49615225
49625226 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4963
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5227
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49645228 }
49655229
49665230 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4979,8 +5243,7 @@
49795243 if (cfs_rq_throttled(cfs_rq))
49805244 return true;
49815245
4982
- throttle_cfs_rq(cfs_rq);
4983
- return true;
5246
+ return throttle_cfs_rq(cfs_rq);
49845247 }
49855248
49865249 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -5009,6 +5272,8 @@
50095272 overrun = hrtimer_forward_now(timer, cfs_b->period);
50105273 if (!overrun)
50115274 break;
5275
+
5276
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50125277
50135278 if (++count > 3) {
50145279 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5039,8 +5304,6 @@
50395304 /* reset count so we don't come right back in here */
50405305 count = 0;
50415306 }
5042
-
5043
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50445307 }
50455308 if (idle)
50465309 cfs_b->period_active = 0;
....@@ -5061,7 +5324,7 @@
50615324 cfs_b->period_timer.function = sched_cfs_period_timer;
50625325 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50635326 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5064
- cfs_b->distribute_running = 0;
5327
+ cfs_b->slack_started = false;
50655328 }
50665329
50675330 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5156,11 +5419,6 @@
51565419 return false;
51575420 }
51585421
5159
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5160
-{
5161
- return rq_clock_task(rq_of(cfs_rq));
5162
-}
5163
-
51645422 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51655423 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51665424 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5253,22 +5511,41 @@
52535511
52545512 #ifdef CONFIG_SMP
52555513 static inline unsigned long cpu_util(int cpu);
5256
-static unsigned long capacity_of(int cpu);
52575514
52585515 static inline bool cpu_overutilized(int cpu)
52595516 {
5260
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5517
+ int overutilized = -1;
5518
+
5519
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5520
+ if (overutilized != -1)
5521
+ return overutilized;
5522
+
5523
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
52615524 }
52625525
52635526 static inline void update_overutilized_status(struct rq *rq)
52645527 {
52655528 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52665529 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5267
- trace_sched_overutilized(1);
5530
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52685531 }
52695532 }
52705533 #else
52715534 static inline void update_overutilized_status(struct rq *rq) { }
5535
+#endif
5536
+
5537
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5538
+static int sched_idle_rq(struct rq *rq)
5539
+{
5540
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5541
+ rq->nr_running);
5542
+}
5543
+
5544
+#ifdef CONFIG_SMP
5545
+static int sched_idle_cpu(int cpu)
5546
+{
5547
+ return sched_idle_rq(cpu_rq(cpu));
5548
+}
52725549 #endif
52735550
52745551 /*
....@@ -5281,12 +5558,9 @@
52815558 {
52825559 struct cfs_rq *cfs_rq;
52835560 struct sched_entity *se = &p->se;
5561
+ int idle_h_nr_running = task_has_idle_policy(p);
52845562 int task_new = !(flags & ENQUEUE_WAKEUP);
5285
-
5286
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5287
- if (sysctl_sched_performance_bias)
5288
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5289
-#endif
5563
+ int should_iowait_boost;
52905564
52915565 /*
52925566 * The code below (indirectly) updates schedutil which looks at
....@@ -5297,29 +5571,13 @@
52975571 util_est_enqueue(&rq->cfs, p);
52985572
52995573 /*
5300
- * The code below (indirectly) updates schedutil which looks at
5301
- * the cfs_rq utilization to select a frequency.
5302
- * Let's update schedtune here to ensure the boost value of the
5303
- * current task is accounted for in the selection of the OPP.
5304
- *
5305
- * We do it also in the case where we enqueue a throttled task;
5306
- * we could argue that a throttled task should not boost a CPU,
5307
- * however:
5308
- * a) properly implementing CPU boosting considering throttled
5309
- * tasks will increase a lot the complexity of the solution
5310
- * b) it's not easy to quantify the benefits introduced by
5311
- * such a more complex solution.
5312
- * Thus, for the time being we go for the simple solution and boost
5313
- * also for throttled RQs.
5314
- */
5315
- schedtune_enqueue_task(p, cpu_of(rq));
5316
-
5317
- /*
53185574 * If in_iowait is set, the code below may not trigger any cpufreq
53195575 * utilization updates, so do it here explicitly with the IOWAIT flag
53205576 * passed.
53215577 */
5322
- if (p->in_iowait)
5578
+ should_iowait_boost = p->in_iowait;
5579
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5580
+ if (should_iowait_boost)
53235581 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53245582
53255583 for_each_sched_entity(se) {
....@@ -5328,51 +5586,60 @@
53285586 cfs_rq = cfs_rq_of(se);
53295587 enqueue_entity(cfs_rq, se, flags);
53305588
5331
- /*
5332
- * end evaluation on encountering a throttled cfs_rq
5333
- *
5334
- * note: in the case of encountering a throttled cfs_rq we will
5335
- * post the final h_nr_running increment below.
5336
- */
5337
- if (cfs_rq_throttled(cfs_rq))
5338
- break;
53395589 cfs_rq->h_nr_running++;
5590
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5591
+
5592
+ /* end evaluation on encountering a throttled cfs_rq */
5593
+ if (cfs_rq_throttled(cfs_rq))
5594
+ goto enqueue_throttle;
53405595
53415596 flags = ENQUEUE_WAKEUP;
53425597 }
53435598
5599
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53445600 for_each_sched_entity(se) {
53455601 cfs_rq = cfs_rq_of(se);
5346
- cfs_rq->h_nr_running++;
5347
-
5348
- if (cfs_rq_throttled(cfs_rq))
5349
- break;
53505602
53515603 update_load_avg(cfs_rq, se, UPDATE_TG);
5604
+ se_update_runnable(se);
53525605 update_cfs_group(se);
5606
+
5607
+ cfs_rq->h_nr_running++;
5608
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5609
+
5610
+ /* end evaluation on encountering a throttled cfs_rq */
5611
+ if (cfs_rq_throttled(cfs_rq))
5612
+ goto enqueue_throttle;
5613
+
5614
+ /*
5615
+ * One parent has been throttled and cfs_rq removed from the
5616
+ * list. Add it back to not break the leaf list.
5617
+ */
5618
+ if (throttled_hierarchy(cfs_rq))
5619
+ list_add_leaf_cfs_rq(cfs_rq);
53535620 }
53545621
5355
- if (!se) {
5356
- add_nr_running(rq, 1);
5357
- /*
5358
- * Since new tasks are assigned an initial util_avg equal to
5359
- * half of the spare capacity of their CPU, tiny tasks have the
5360
- * ability to cross the overutilized threshold, which will
5361
- * result in the load balancer ruining all the task placement
5362
- * done by EAS. As a way to mitigate that effect, do not account
5363
- * for the first enqueue operation of new tasks during the
5364
- * overutilized flag detection.
5365
- *
5366
- * A better way of solving this problem would be to wait for
5367
- * the PELT signals of tasks to converge before taking them
5368
- * into account, but that is not straightforward to implement,
5369
- * and the following generally works well enough in practice.
5370
- */
5371
- if (!task_new)
5372
- update_overutilized_status(rq);
5622
+ /* At this point se is NULL and we are at root level*/
5623
+ add_nr_running(rq, 1);
53735624
5374
- }
5625
+ /*
5626
+ * Since new tasks are assigned an initial util_avg equal to
5627
+ * half of the spare capacity of their CPU, tiny tasks have the
5628
+ * ability to cross the overutilized threshold, which will
5629
+ * result in the load balancer ruining all the task placement
5630
+ * done by EAS. As a way to mitigate that effect, do not account
5631
+ * for the first enqueue operation of new tasks during the
5632
+ * overutilized flag detection.
5633
+ *
5634
+ * A better way of solving this problem would be to wait for
5635
+ * the PELT signals of tasks to converge before taking them
5636
+ * into account, but that is not straightforward to implement,
5637
+ * and the following generally works well enough in practice.
5638
+ */
5639
+ if (!task_new)
5640
+ update_overutilized_status(rq);
53755641
5642
+enqueue_throttle:
53765643 if (cfs_bandwidth_used()) {
53775644 /*
53785645 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5405,28 +5672,21 @@
54055672 struct cfs_rq *cfs_rq;
54065673 struct sched_entity *se = &p->se;
54075674 int task_sleep = flags & DEQUEUE_SLEEP;
5675
+ int idle_h_nr_running = task_has_idle_policy(p);
5676
+ bool was_sched_idle = sched_idle_rq(rq);
54085677
5409
- /*
5410
- * The code below (indirectly) updates schedutil which looks at
5411
- * the cfs_rq utilization to select a frequency.
5412
- * Let's update schedtune here to ensure the boost value of the
5413
- * current task is not more accounted for in the selection of the OPP.
5414
- */
5415
- schedtune_dequeue_task(p, cpu_of(rq));
5678
+ util_est_dequeue(&rq->cfs, p);
54165679
54175680 for_each_sched_entity(se) {
54185681 cfs_rq = cfs_rq_of(se);
54195682 dequeue_entity(cfs_rq, se, flags);
54205683
5421
- /*
5422
- * end evaluation on encountering a throttled cfs_rq
5423
- *
5424
- * note: in the case of encountering a throttled cfs_rq we will
5425
- * post the final h_nr_running decrement below.
5426
- */
5427
- if (cfs_rq_throttled(cfs_rq))
5428
- break;
54295684 cfs_rq->h_nr_running--;
5685
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5686
+
5687
+ /* end evaluation on encountering a throttled cfs_rq */
5688
+ if (cfs_rq_throttled(cfs_rq))
5689
+ goto dequeue_throttle;
54305690
54315691 /* Don't dequeue parent if it has other entities besides us */
54325692 if (cfs_rq->load.weight) {
....@@ -5443,21 +5703,32 @@
54435703 flags |= DEQUEUE_SLEEP;
54445704 }
54455705
5706
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54465707 for_each_sched_entity(se) {
54475708 cfs_rq = cfs_rq_of(se);
5448
- cfs_rq->h_nr_running--;
5449
-
5450
- if (cfs_rq_throttled(cfs_rq))
5451
- break;
54525709
54535710 update_load_avg(cfs_rq, se, UPDATE_TG);
5711
+ se_update_runnable(se);
54545712 update_cfs_group(se);
5713
+
5714
+ cfs_rq->h_nr_running--;
5715
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5716
+
5717
+ /* end evaluation on encountering a throttled cfs_rq */
5718
+ if (cfs_rq_throttled(cfs_rq))
5719
+ goto dequeue_throttle;
5720
+
54555721 }
54565722
5457
- if (!se)
5458
- sub_nr_running(rq, 1);
5723
+ /* At this point se is NULL and we are at root level*/
5724
+ sub_nr_running(rq, 1);
54595725
5460
- util_est_dequeue(&rq->cfs, p, task_sleep);
5726
+ /* balance early to pull high priority tasks */
5727
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5728
+ rq->next_balance = jiffies;
5729
+
5730
+dequeue_throttle:
5731
+ util_est_update(&rq->cfs, p, task_sleep);
54615732 hrtick_update(rq);
54625733 }
54635734
....@@ -5468,71 +5739,6 @@
54685739 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54695740
54705741 #ifdef CONFIG_NO_HZ_COMMON
5471
-/*
5472
- * per rq 'load' arrray crap; XXX kill this.
5473
- */
5474
-
5475
-/*
5476
- * The exact cpuload calculated at every tick would be:
5477
- *
5478
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5479
- *
5480
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5481
- * called on the n+1-th tick when CPU may be busy, then we have:
5482
- *
5483
- * load_n = (1 - 1/2^i)^n * load_0
5484
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5485
- *
5486
- * decay_load_missed() below does efficient calculation of
5487
- *
5488
- * load' = (1 - 1/2^i)^n * load
5489
- *
5490
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5491
- * This allows us to precompute the above in said factors, thereby allowing the
5492
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5493
- * fixed_power_int())
5494
- *
5495
- * The calculation is approximated on a 128 point scale.
5496
- */
5497
-#define DEGRADE_SHIFT 7
5498
-
5499
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5500
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5501
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5502
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5503
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5504
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5505
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5506
-};
5507
-
5508
-/*
5509
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5510
- * would be when CPU is idle and so we just decay the old load without
5511
- * adding any new load.
5512
- */
5513
-static unsigned long
5514
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5515
-{
5516
- int j = 0;
5517
-
5518
- if (!missed_updates)
5519
- return load;
5520
-
5521
- if (missed_updates >= degrade_zero_ticks[idx])
5522
- return 0;
5523
-
5524
- if (idx == 1)
5525
- return load >> missed_updates;
5526
-
5527
- while (missed_updates) {
5528
- if (missed_updates % 2)
5529
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5530
-
5531
- missed_updates >>= 1;
5532
- j++;
5533
- }
5534
- return load;
5535
-}
55365742
55375743 static struct {
55385744 cpumask_var_t idle_cpus_mask;
....@@ -5544,249 +5750,68 @@
55445750
55455751 #endif /* CONFIG_NO_HZ_COMMON */
55465752
5547
-/**
5548
- * __cpu_load_update - update the rq->cpu_load[] statistics
5549
- * @this_rq: The rq to update statistics for
5550
- * @this_load: The current load
5551
- * @pending_updates: The number of missed updates
5552
- *
5553
- * Update rq->cpu_load[] statistics. This function is usually called every
5554
- * scheduler tick (TICK_NSEC).
5555
- *
5556
- * This function computes a decaying average:
5557
- *
5558
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5559
- *
5560
- * Because of NOHZ it might not get called on every tick which gives need for
5561
- * the @pending_updates argument.
5562
- *
5563
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5564
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5565
- * = A * (A * load[i]_n-2 + B) + B
5566
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5567
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5568
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5569
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5570
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5571
- *
5572
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5573
- * any change in load would have resulted in the tick being turned back on.
5574
- *
5575
- * For regular NOHZ, this reduces to:
5576
- *
5577
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5578
- *
5579
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5580
- * term.
5581
- */
5582
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5583
- unsigned long pending_updates)
5753
+static unsigned long cpu_load(struct rq *rq)
55845754 {
5585
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5586
- int i, scale;
5587
-
5588
- this_rq->nr_load_updates++;
5589
-
5590
- /* Update our load: */
5591
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5592
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5593
- unsigned long old_load, new_load;
5594
-
5595
- /* scale is effectively 1 << i now, and >> i divides by scale */
5596
-
5597
- old_load = this_rq->cpu_load[i];
5598
-#ifdef CONFIG_NO_HZ_COMMON
5599
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5600
- if (tickless_load) {
5601
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5602
- /*
5603
- * old_load can never be a negative value because a
5604
- * decayed tickless_load cannot be greater than the
5605
- * original tickless_load.
5606
- */
5607
- old_load += tickless_load;
5608
- }
5609
-#endif
5610
- new_load = this_load;
5611
- /*
5612
- * Round up the averaging division if load is increasing. This
5613
- * prevents us from getting stuck on 9 if the load is 10, for
5614
- * example.
5615
- */
5616
- if (new_load > old_load)
5617
- new_load += scale - 1;
5618
-
5619
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5620
- }
5621
-}
5622
-
5623
-/* Used instead of source_load when we know the type == 0 */
5624
-static unsigned long weighted_cpuload(struct rq *rq)
5625
-{
5626
- return cfs_rq_runnable_load_avg(&rq->cfs);
5627
-}
5628
-
5629
-#ifdef CONFIG_NO_HZ_COMMON
5630
-/*
5631
- * There is no sane way to deal with nohz on smp when using jiffies because the
5632
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5633
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5634
- *
5635
- * Therefore we need to avoid the delta approach from the regular tick when
5636
- * possible since that would seriously skew the load calculation. This is why we
5637
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5638
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5639
- * loop exit, nohz_idle_balance, nohz full exit...)
5640
- *
5641
- * This means we might still be one tick off for nohz periods.
5642
- */
5643
-
5644
-static void cpu_load_update_nohz(struct rq *this_rq,
5645
- unsigned long curr_jiffies,
5646
- unsigned long load)
5647
-{
5648
- unsigned long pending_updates;
5649
-
5650
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5651
- if (pending_updates) {
5652
- this_rq->last_load_update_tick = curr_jiffies;
5653
- /*
5654
- * In the regular NOHZ case, we were idle, this means load 0.
5655
- * In the NOHZ_FULL case, we were non-idle, we should consider
5656
- * its weighted load.
5657
- */
5658
- cpu_load_update(this_rq, load, pending_updates);
5659
- }
5755
+ return cfs_rq_load_avg(&rq->cfs);
56605756 }
56615757
56625758 /*
5663
- * Called from nohz_idle_balance() to update the load ratings before doing the
5664
- * idle balance.
5665
- */
5666
-static void cpu_load_update_idle(struct rq *this_rq)
5667
-{
5668
- /*
5669
- * bail if there's load or we're actually up-to-date.
5670
- */
5671
- if (weighted_cpuload(this_rq))
5672
- return;
5673
-
5674
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5675
-}
5676
-
5677
-/*
5678
- * Record CPU load on nohz entry so we know the tickless load to account
5679
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
5680
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5681
- * shouldn't rely into synchronized cpu_load[*] updates.
5682
- */
5683
-void cpu_load_update_nohz_start(void)
5684
-{
5685
- struct rq *this_rq = this_rq();
5686
-
5687
- /*
5688
- * This is all lockless but should be fine. If weighted_cpuload changes
5689
- * concurrently we'll exit nohz. And cpu_load write can race with
5690
- * cpu_load_update_idle() but both updater would be writing the same.
5691
- */
5692
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5693
-}
5694
-
5695
-/*
5696
- * Account the tickless load in the end of a nohz frame.
5697
- */
5698
-void cpu_load_update_nohz_stop(void)
5699
-{
5700
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5701
- struct rq *this_rq = this_rq();
5702
- unsigned long load;
5703
- struct rq_flags rf;
5704
-
5705
- if (curr_jiffies == this_rq->last_load_update_tick)
5706
- return;
5707
-
5708
- load = weighted_cpuload(this_rq);
5709
- rq_lock(this_rq, &rf);
5710
- update_rq_clock(this_rq);
5711
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5712
- rq_unlock(this_rq, &rf);
5713
-}
5714
-#else /* !CONFIG_NO_HZ_COMMON */
5715
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5716
- unsigned long curr_jiffies,
5717
- unsigned long load) { }
5718
-#endif /* CONFIG_NO_HZ_COMMON */
5719
-
5720
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5721
-{
5722
-#ifdef CONFIG_NO_HZ_COMMON
5723
- /* See the mess around cpu_load_update_nohz(). */
5724
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5725
-#endif
5726
- cpu_load_update(this_rq, load, 1);
5727
-}
5728
-
5729
-/*
5730
- * Called from scheduler_tick()
5731
- */
5732
-void cpu_load_update_active(struct rq *this_rq)
5733
-{
5734
- unsigned long load = weighted_cpuload(this_rq);
5735
-
5736
- if (tick_nohz_tick_stopped())
5737
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5738
- else
5739
- cpu_load_update_periodic(this_rq, load);
5740
-}
5741
-
5742
-/*
5743
- * Return a low guess at the load of a migration-source CPU weighted
5744
- * according to the scheduling class and "nice" value.
5759
+ * cpu_load_without - compute CPU load without any contributions from *p
5760
+ * @cpu: the CPU which load is requested
5761
+ * @p: the task which load should be discounted
57455762 *
5746
- * We want to under-estimate the load of migration sources, to
5747
- * balance conservatively.
5763
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5764
+ * CPU as well as tasks which are currently sleeping after an execution on that
5765
+ * CPU.
5766
+ *
5767
+ * This method returns the load of the specified CPU by discounting the load of
5768
+ * the specified task, whenever the task is currently contributing to the CPU
5769
+ * load.
57485770 */
5749
-static unsigned long source_load(int cpu, int type)
5771
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57505772 {
5751
- struct rq *rq = cpu_rq(cpu);
5752
- unsigned long total = weighted_cpuload(rq);
5773
+ struct cfs_rq *cfs_rq;
5774
+ unsigned int load;
57535775
5754
- if (type == 0 || !sched_feat(LB_BIAS))
5755
- return total;
5776
+ /* Task has no contribution or is new */
5777
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5778
+ return cpu_load(rq);
57565779
5757
- return min(rq->cpu_load[type-1], total);
5780
+ cfs_rq = &rq->cfs;
5781
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5782
+
5783
+ /* Discount task's util from CPU's util */
5784
+ lsub_positive(&load, task_h_load(p));
5785
+
5786
+ return load;
57585787 }
57595788
5760
-/*
5761
- * Return a high guess at the load of a migration-target CPU weighted
5762
- * according to the scheduling class and "nice" value.
5763
- */
5764
-static unsigned long target_load(int cpu, int type)
5789
+static unsigned long cpu_runnable(struct rq *rq)
57655790 {
5766
- struct rq *rq = cpu_rq(cpu);
5767
- unsigned long total = weighted_cpuload(rq);
5791
+ return cfs_rq_runnable_avg(&rq->cfs);
5792
+}
57685793
5769
- if (type == 0 || !sched_feat(LB_BIAS))
5770
- return total;
5794
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5795
+{
5796
+ struct cfs_rq *cfs_rq;
5797
+ unsigned int runnable;
57715798
5772
- return max(rq->cpu_load[type-1], total);
5799
+ /* Task has no contribution or is new */
5800
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5801
+ return cpu_runnable(rq);
5802
+
5803
+ cfs_rq = &rq->cfs;
5804
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5805
+
5806
+ /* Discount task's runnable from CPU's runnable */
5807
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5808
+
5809
+ return runnable;
57735810 }
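cpu_load_without() and cpu_runnable_without() subtract the waking task's own contribution from the CPU-level signal, but only when that contribution is actually accounted on this CPU, and the subtraction saturates at zero the way lsub_positive() does. A small sketch of the discounting, with lsub_positive open-coded since only its semantics matter here:

#include <stdio.h>

/* saturating subtract, matching the semantics of lsub_positive() */
static void lsub_positive(unsigned long *x, unsigned long y)
{
	*x = (*x > y) ? *x - y : 0;
}

/*
 * Return the rq-level signal with the task's contribution removed, but only
 * if the task is currently accounted on this CPU (as cpu_load_without() does).
 */
static unsigned long load_without(unsigned long rq_load, unsigned long contrib,
				  int task_on_this_cpu)
{
	if (!task_on_this_cpu)
		return rq_load;

	lsub_positive(&rq_load, contrib);
	return rq_load;
}

int main(void)
{
	printf("%lu\n", load_without(900, 300, 1));	/* -> 600              */
	printf("%lu\n", load_without(900, 300, 0));	/* task elsewhere: 900 */
	printf("%lu\n", load_without(200, 300, 1));	/* saturates to 0      */
	return 0;
}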
57745811
57755812 static unsigned long capacity_of(int cpu)
57765813 {
57775814 return cpu_rq(cpu)->cpu_capacity;
5778
-}
5779
-
5780
-static unsigned long cpu_avg_load_per_task(int cpu)
5781
-{
5782
- struct rq *rq = cpu_rq(cpu);
5783
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5784
- unsigned long load_avg = weighted_cpuload(rq);
5785
-
5786
- if (nr_running)
5787
- return load_avg / nr_running;
5788
-
5789
- return 0;
57905815 }
57915816
57925817 static void record_wakee(struct task_struct *p)
....@@ -5823,18 +5848,15 @@
58235848 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58245849 * socket size.
58255850 */
5826
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
5851
+static int wake_wide(struct task_struct *p)
58275852 {
58285853 unsigned int master = current->wakee_flips;
58295854 unsigned int slave = p->wakee_flips;
5830
- int llc_size = this_cpu_read(sd_llc_size);
5831
-
5832
- if (sibling_count_hint >= llc_size)
5833
- return 1;
5855
+ int factor = __this_cpu_read(sd_llc_size);
58345856
58355857 if (master < slave)
58365858 swap(master, slave);
5837
- if (slave < llc_size || master < slave * llc_size)
5859
+ if (slave < factor || master < slave * factor)
58385860 return 0;
58395861 return 1;
58405862 }
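wake_wide() compares the waker/wakee flip counts against the LLC size to decide whether the pair has outgrown a single cache domain: it only reports "wide" when the smaller flip count reaches the LLC factor and the larger one reaches smaller * factor. A worked example with hypothetical numbers:

#include <stdio.h>

/* same decision rule as wake_wide(), with the flip counts passed in */
static int wake_wide(unsigned int master, unsigned int slave, unsigned int factor)
{
	if (master < slave) {		/* swap() */
		unsigned int tmp = master;

		master = slave;
		slave = tmp;
	}
	if (slave < factor || master < slave * factor)
		return 0;		/* keep the wakeup affine   */
	return 1;			/* spread across the domain */
}

int main(void)
{
	unsigned int llc = 4;	/* hypothetical LLC span */

	/* 1:1 waker/wakee pair: stay affine */
	printf("%d\n", wake_wide(3, 2, llc));	/* -> 0 */
	/* one server waking many distinct clients: go wide */
	printf("%d\n", wake_wide(64, 5, llc));	/* -> 1 */
	return 0;
}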
....@@ -5882,7 +5904,7 @@
58825904 s64 this_eff_load, prev_eff_load;
58835905 unsigned long task_load;
58845906
5885
- this_eff_load = target_load(this_cpu, sd->wake_idx);
5907
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58865908
58875909 if (sync) {
58885910 unsigned long current_load = task_h_load(current);
....@@ -5900,7 +5922,7 @@
59005922 this_eff_load *= 100;
59015923 this_eff_load *= capacity_of(prev_cpu);
59025924
5903
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5925
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59045926 prev_eff_load -= task_load;
59055927 if (sched_feat(WA_BIAS))
59065928 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
....@@ -5938,242 +5960,8 @@
59385960 return target;
59395961 }
59405962
5941
-#ifdef CONFIG_SCHED_TUNE
5942
-struct reciprocal_value schedtune_spc_rdiv;
5943
-
5944
-static long
5945
-schedtune_margin(unsigned long signal, long boost)
5946
-{
5947
- long long margin = 0;
5948
-
5949
- /*
5950
- * Signal proportional compensation (SPC)
5951
- *
5952
- * The Boost (B) value is used to compute a Margin (M) which is
5953
- * proportional to the complement of the original Signal (S):
5954
- * M = B * (SCHED_CAPACITY_SCALE - S)
5955
- * The obtained M could be used by the caller to "boost" S.
5956
- */
5957
- if (boost >= 0) {
5958
- margin = SCHED_CAPACITY_SCALE - signal;
5959
- margin *= boost;
5960
- } else
5961
- margin = -signal * boost;
5962
-
5963
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5964
-
5965
- if (boost < 0)
5966
- margin *= -1;
5967
- return margin;
5968
-}
5969
-
5970
-inline long
5971
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5972
-{
5973
- int boost = schedtune_cpu_boost_with(cpu, p);
5974
- long margin;
5975
-
5976
- if (boost == 0)
5977
- margin = 0;
5978
- else
5979
- margin = schedtune_margin(util, boost);
5980
-
5981
- trace_sched_boost_cpu(cpu, util, margin);
5982
-
5983
- return margin;
5984
-}
5985
-
5986
-long schedtune_task_margin(struct task_struct *task)
5987
-{
5988
- int boost = schedtune_task_boost(task);
5989
- unsigned long util;
5990
- long margin;
5991
-
5992
- if (boost == 0)
5993
- return 0;
5994
-
5995
- util = task_util_est(task);
5996
- margin = schedtune_margin(util, boost);
5997
-
5998
- return margin;
5999
-}
6000
-
6001
-#else /* CONFIG_SCHED_TUNE */
6002
-
6003
-inline long
6004
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6005
-{
6006
- return 0;
6007
-}
6008
-
6009
-#endif /* CONFIG_SCHED_TUNE */
6010
-
6011
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6012
-
6013
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6014
-{
6015
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6016
-}
6017
-
6018
-/*
6019
- * find_idlest_group finds and returns the least busy CPU group within the
6020
- * domain.
6021
- *
6022
- * Assumes p is allowed on at least one CPU in sd.
6023
- */
60245963 static struct sched_group *
6025
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6026
- int this_cpu, int sd_flag)
6027
-{
6028
- struct sched_group *idlest = NULL, *group = sd->groups;
6029
- struct sched_group *most_spare_sg = NULL;
6030
- unsigned long min_runnable_load = ULONG_MAX;
6031
- unsigned long this_runnable_load = ULONG_MAX;
6032
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6033
- unsigned long most_spare = 0, this_spare = 0;
6034
- int load_idx = sd->forkexec_idx;
6035
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6036
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6037
- (sd->imbalance_pct-100) / 100;
6038
-
6039
- if (sd_flag & SD_BALANCE_WAKE)
6040
- load_idx = sd->wake_idx;
6041
-
6042
- do {
6043
- unsigned long load, avg_load, runnable_load;
6044
- unsigned long spare_cap, max_spare_cap;
6045
- int local_group;
6046
- int i;
6047
-
6048
- /* Skip over this group if it has no CPUs allowed */
6049
- if (!cpumask_intersects(sched_group_span(group),
6050
- p->cpus_ptr))
6051
- continue;
6052
-
6053
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6054
- if (sysctl_sched_performance_bias) {
6055
- if (!task_fits_max(p, group_first_cpu(group)))
6056
- continue;
6057
- }
6058
-#endif
6059
-
6060
- local_group = cpumask_test_cpu(this_cpu,
6061
- sched_group_span(group));
6062
-
6063
- /*
6064
- * Tally up the load of all CPUs in the group and find
6065
- * the group containing the CPU with most spare capacity.
6066
- */
6067
- avg_load = 0;
6068
- runnable_load = 0;
6069
- max_spare_cap = 0;
6070
-
6071
- for_each_cpu(i, sched_group_span(group)) {
6072
- /* Bias balancing toward CPUs of our domain */
6073
- if (local_group)
6074
- load = source_load(i, load_idx);
6075
- else
6076
- load = target_load(i, load_idx);
6077
-
6078
- runnable_load += load;
6079
-
6080
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6081
-
6082
- spare_cap = capacity_spare_without(i, p);
6083
-
6084
- if (spare_cap > max_spare_cap)
6085
- max_spare_cap = spare_cap;
6086
- }
6087
-
6088
- /* Adjust by relative CPU capacity of the group */
6089
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6092
- group->sgc->capacity;
6093
-
6094
- if (local_group) {
6095
- this_runnable_load = runnable_load;
6096
- this_avg_load = avg_load;
6097
- this_spare = max_spare_cap;
6098
- } else {
6099
- if (min_runnable_load > (runnable_load + imbalance)) {
6100
- /*
6101
- * The runnable load is significantly smaller
6102
- * so we can pick this new CPU:
6103
- */
6104
- min_runnable_load = runnable_load;
6105
- min_avg_load = avg_load;
6106
- idlest = group;
6107
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6108
- (100*min_avg_load > imbalance_scale*avg_load)) {
6109
- /*
6110
- * The runnable loads are close so take the
6111
- * blocked load into account through avg_load:
6112
- */
6113
- min_avg_load = avg_load;
6114
- idlest = group;
6115
- }
6116
-
6117
- if (most_spare < max_spare_cap) {
6118
- most_spare = max_spare_cap;
6119
- most_spare_sg = group;
6120
- }
6121
- }
6122
- } while (group = group->next, group != sd->groups);
6123
-
6124
- /*
6125
- * The cross-over point between using spare capacity or least load
6126
- * is too conservative for high utilization tasks on partially
6127
- * utilized systems if we require spare_capacity > task_util(p),
6128
- * so we allow for some task stuffing by using
6129
- * spare_capacity > task_util(p)/2.
6130
- *
6131
- * Spare capacity can't be used for fork because the utilization has
6132
- * not been set yet, we must first select a rq to compute the initial
6133
- * utilization.
6134
- */
6135
- if (sd_flag & SD_BALANCE_FORK)
6136
- goto skip_spare;
6137
-
6138
- if (this_spare > task_util(p) / 2 &&
6139
- imbalance_scale*this_spare > 100*most_spare)
6140
- return NULL;
6141
-
6142
- if (most_spare > task_util(p) / 2)
6143
- return most_spare_sg;
6144
-
6145
-skip_spare:
6146
- if (!idlest)
6147
- return NULL;
6148
-
6149
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6150
- if (sysctl_sched_performance_bias) {
6151
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6152
- return idlest;
6153
- }
6154
-#endif
6155
-
6156
- /*
6157
- * When comparing groups across NUMA domains, it's possible for the
6158
- * local domain to be very lightly loaded relative to the remote
6159
- * domains but "imbalance" skews the comparison making remote CPUs
6160
- * look much more favourable. When considering cross-domain, add
6161
- * imbalance to the runnable load on the remote node and consider
6162
- * staying local.
6163
- */
6164
- if ((sd->flags & SD_NUMA) &&
6165
- min_runnable_load + imbalance >= this_runnable_load)
6166
- return NULL;
6167
-
6168
- if (min_runnable_load > (this_runnable_load + imbalance))
6169
- return NULL;
6170
-
6171
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6172
- (100*this_avg_load < imbalance_scale*min_avg_load))
6173
- return NULL;
6174
-
6175
- return idlest;
6176
-}
5964
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
61775965
61785966 /*
61795967 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6194,6 +5982,9 @@
61945982
61955983 /* Traverse only the allowed CPUs */
61965984 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5985
+ if (sched_idle_cpu(i))
5986
+ return i;
5987
+
61975988 if (available_idle_cpu(i)) {
61985989 struct rq *rq = cpu_rq(i);
61995990 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6217,7 +6008,7 @@
62176008 shallowest_idle_cpu = i;
62186009 }
62196010 } else if (shallowest_idle_cpu == -1) {
6220
- load = weighted_cpuload(cpu_rq(i));
6011
+ load = cpu_load(cpu_rq(i));
62216012 if (load < min_load) {
62226013 min_load = load;
62236014 least_loaded_cpu = i;
....@@ -6237,7 +6028,7 @@
62376028 return prev_cpu;
62386029
62396030 /*
6240
- * We need task's util for capacity_spare_without, sync it up to
6031
+ * We need task's util for cpu_util_without, sync it up to
62416032 * prev_cpu's last_update_time.
62426033 */
62436034 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6253,7 +6044,7 @@
62536044 continue;
62546045 }
62556046
6256
- group = find_idlest_group(sd, p, cpu, sd_flag);
6047
+ group = find_idlest_group(sd, p, cpu);
62576048 if (!group) {
62586049 sd = sd->child;
62596050 continue;
....@@ -6356,10 +6147,12 @@
63566147 bool idle = true;
63576148
63586149 for_each_cpu(cpu, cpu_smt_mask(core)) {
6359
- cpumask_clear_cpu(cpu, cpus);
6360
- if (!available_idle_cpu(cpu))
6150
+ if (!available_idle_cpu(cpu)) {
63616151 idle = false;
6152
+ break;
6153
+ }
63626154 }
6155
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63636156
63646157 if (idle)
63656158 return core;
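
The reworked loop above gives up on a core as soon as one busy sibling is found and then
removes the whole SMT mask from the scan set with a single cpumask_andnot() instead of
clearing one CPU per iteration. A tiny bitmap sketch of that pruning, with an invented
eight-CPU layout:

#include <stdio.h>

int main(void)
{
	unsigned int scan_mask = 0xff;		/* CPUs 0-7 still to be scanned */
	unsigned int smt_mask_core0 = 0x03;	/* CPUs 0 and 1 share core 0 */

	/* analogue of cpumask_andnot(cpus, cpus, cpu_smt_mask(core)) */
	scan_mask &= ~smt_mask_core0;

	printf("remaining scan mask: 0x%02x\n", scan_mask);	/* 0xfc */
	return 0;
}
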
....@@ -6384,9 +6177,10 @@
63846177 return -1;
63856178
63866179 for_each_cpu(cpu, cpu_smt_mask(target)) {
6387
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6180
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6181
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63886182 continue;
6389
- if (available_idle_cpu(cpu))
6183
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63906184 return cpu;
63916185 }
63926186
....@@ -6417,8 +6211,8 @@
64176211 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64186212 struct sched_domain *this_sd;
64196213 u64 avg_cost, avg_idle;
6420
- u64 time, cost;
6421
- s64 delta;
6214
+ u64 time;
6215
+ int this = smp_processor_id();
64226216 int cpu, nr = INT_MAX;
64236217
64246218 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6443,23 +6237,63 @@
64436237 nr = 4;
64446238 }
64456239
6446
- time = local_clock();
6240
+ time = cpu_clock(this);
64476241
64486242 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64496243
64506244 for_each_cpu_wrap(cpu, cpus, target) {
64516245 if (!--nr)
64526246 return -1;
6453
- if (available_idle_cpu(cpu))
6247
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64546248 break;
64556249 }
64566250
6457
- time = local_clock() - time;
6458
- cost = this_sd->avg_scan_cost;
6459
- delta = (s64)(time - cost) / 8;
6460
- this_sd->avg_scan_cost += delta;
6251
+ time = cpu_clock(this) - time;
6252
+ update_avg(&this_sd->avg_scan_cost, time);
64616253
64626254 return cpu;
6255
+}
6256
+
6257
+/*
6258
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6259
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6260
+ * maximize capacity.
6261
+ */
6262
+static int
6263
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6264
+{
6265
+ unsigned long task_util, best_cap = 0;
6266
+ int cpu, best_cpu = -1;
6267
+ struct cpumask *cpus;
6268
+
6269
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6270
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6271
+
6272
+ task_util = uclamp_task_util(p);
6273
+
6274
+ for_each_cpu_wrap(cpu, cpus, target) {
6275
+ unsigned long cpu_cap = capacity_of(cpu);
6276
+
6277
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6278
+ continue;
6279
+ if (fits_capacity(task_util, cpu_cap))
6280
+ return cpu;
6281
+
6282
+ if (cpu_cap > best_cap) {
6283
+ best_cap = cpu_cap;
6284
+ best_cpu = cpu;
6285
+ }
6286
+ }
6287
+
6288
+ return best_cpu;
6289
+}
6290
+
6291
+static inline bool asym_fits_capacity(int task_util, int cpu)
6292
+{
6293
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6294
+ return fits_capacity(task_util, capacity_of(cpu));
6295
+
6296
+ return true;
64636297 }
64646298
64656299 /*
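
Besides switching the timestamps to cpu_clock(), the hunk above replaces the open-coded
scan-cost averaging with update_avg(), i.e. an exponential moving average with a weight of
1/8. A stand-alone sketch of that averaging follows; the sample scan times are made up.

#include <stdio.h>

/* Model of update_avg(): avg += (sample - avg) / 8. */
static void update_avg_model(long long *avg, long long sample)
{
	*avg += (sample - *avg) / 8;
}

int main(void)
{
	long long avg_scan_cost = 0;
	long long samples[] = { 8000, 4000, 12000, 6000 };	/* ns, invented */

	for (int i = 0; i < 4; i++) {
		update_avg_model(&avg_scan_cost, samples[i]);
		printf("after a %lld ns scan: avg = %lld ns\n",
		       samples[i], avg_scan_cost);
	}
	return 0;
}
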
....@@ -6468,24 +6302,54 @@
64686302 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64696303 {
64706304 struct sched_domain *sd;
6305
+ unsigned long task_util;
64716306 int i, recent_used_cpu;
64726307
6473
- if (available_idle_cpu(target))
6308
+ /*
6309
+ * On asymmetric system, update task utilization because we will check
6310
+ * that the task fits with cpu's capacity.
6311
+ */
6312
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6313
+ sync_entity_load_avg(&p->se);
6314
+ task_util = uclamp_task_util(p);
6315
+ }
6316
+
6317
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6318
+ asym_fits_capacity(task_util, target))
64746319 return target;
64756320
64766321 /*
64776322 * If the previous CPU is cache affine and idle, don't be stupid:
64786323 */
6479
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6324
+ if (prev != target && cpus_share_cache(prev, target) &&
6325
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6326
+ asym_fits_capacity(task_util, prev))
64806327 return prev;
6328
+
6329
+ /*
6330
+ * Allow a per-cpu kthread to stack with the wakee if the
6331
+ * kworker thread and the tasks previous CPUs are the same.
6332
+ * The assumption is that the wakee queued work for the
6333
+ * per-cpu kthread that is now complete and the wakeup is
6334
+ * essentially a sync wakeup. An obvious example of this
6335
+ * pattern is IO completions.
6336
+ */
6337
+ if (is_per_cpu_kthread(current) &&
6338
+ in_task() &&
6339
+ prev == smp_processor_id() &&
6340
+ this_rq()->nr_running <= 1 &&
6341
+ asym_fits_capacity(task_util, prev)) {
6342
+ return prev;
6343
+ }
64816344
64826345 /* Check a recently used CPU as a potential idle candidate: */
64836346 recent_used_cpu = p->recent_used_cpu;
64846347 if (recent_used_cpu != prev &&
64856348 recent_used_cpu != target &&
64866349 cpus_share_cache(recent_used_cpu, target) &&
6487
- available_idle_cpu(recent_used_cpu) &&
6488
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6350
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6351
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6352
+ asym_fits_capacity(task_util, recent_used_cpu)) {
64896353 /*
64906354 * Replace recent_used_cpu with prev as it is a potential
64916355 * candidate for the next wake:
....@@ -6494,6 +6358,32 @@
64946358 return recent_used_cpu;
64956359 }
64966360
6361
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6362
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6363
+ goto sd_llc;
6364
+ }
6365
+
6366
+ /*
6367
+ * For asymmetric CPU capacity systems, our domain of interest is
6368
+ * sd_asym_cpucapacity rather than sd_llc.
6369
+ */
6370
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6371
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6372
+ /*
6373
+ * On an asymmetric CPU capacity system where an exclusive
6374
+ * cpuset defines a symmetric island (i.e. one unique
6375
+ * capacity_orig value through the cpuset), the key will be set
6376
+ * but the CPUs within that cpuset will not have a domain with
6377
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6378
+ * capacity path.
6379
+ */
6380
+ if (sd) {
6381
+ i = select_idle_capacity(p, sd, target);
6382
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6383
+ }
6384
+ }
6385
+
6386
+sd_llc:
64976387 sd = rcu_dereference(per_cpu(sd_llc, target));
64986388 if (!sd)
64996389 return target;
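
With asym_fits_capacity() in place, every fast-path candidate on an asymmetric system must
also pass the capacity-fitting check. A stand-alone model of that check is sketched below,
using the kernel's usual ~20% headroom margin; the task utilization and the CPU capacities
are invented values.

#include <stdio.h>
#include <stdbool.h>

/* A task "fits" a CPU only if it leaves roughly 20% headroom, i.e.
 * util * 1.25 < capacity, expressed in integer math. */
static bool fits_capacity_model(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

int main(void)
{
	unsigned long task_util = 400;

	printf("fits a little CPU (cap 430): %d\n",
	       fits_capacity_model(task_util, 430));	/* 0 */
	printf("fits a big CPU (cap 1024):   %d\n",
	       fits_capacity_model(task_util, 1024));	/* 1 */
	return 0;
}
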
....@@ -6591,7 +6481,7 @@
65916481 util = READ_ONCE(cfs_rq->avg.util_avg);
65926482
65936483 /* Discount task's util from CPU's util */
6594
- util -= min_t(unsigned int, util, task_util(p));
6484
+ lsub_positive(&util, task_util(p));
65956485
65966486 /*
65976487 * Covered cases:
....@@ -6640,10 +6530,9 @@
66406530 * properly fix the execl regression and it helps in further
66416531 * reducing the chances for the above race.
66426532 */
6643
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6644
- estimated -= min_t(unsigned int, estimated,
6645
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6646
- }
6533
+ if (unlikely(task_on_rq_queued(p) || current == p))
6534
+ lsub_positive(&estimated, _task_util_est(p));
6535
+
66476536 util = max(util, estimated);
66486537 }
66496538
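
The two hunks above replace the open-coded "subtract at most the current value" pattern with
lsub_positive(). A stand-alone model of that helper:

#include <stdio.h>

/* Subtract a value from an unsigned quantity without letting it wrap
 * below zero, mirroring the "x -= min(x, y)" code being replaced. */
static void lsub_positive_model(unsigned long *val, unsigned long dec)
{
	*val -= (dec < *val) ? dec : *val;
}

int main(void)
{
	unsigned long util = 300;

	lsub_positive_model(&util, 120);	/* 300 - 120 = 180 */
	printf("util = %lu\n", util);
	lsub_positive_model(&util, 500);	/* clamps at 0, no underflow */
	printf("util = %lu\n", util);
	return 0;
}
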
....@@ -6653,350 +6542,6 @@
66536542 * the cpu_util call.
66546543 */
66556544 return min_t(unsigned long, util, capacity_orig_of(cpu));
6656
-}
6657
-
6658
-/*
6659
- * Returns the current capacity of cpu after applying both
6660
- * cpu and freq scaling.
6661
- */
6662
-unsigned long capacity_curr_of(int cpu)
6663
-{
6664
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6665
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6666
-
6667
- return cap_scale(max_cap, scale_freq);
6668
-}
6669
-
6670
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6671
- struct task_struct *p)
6672
-{
6673
- unsigned long min_util = uclamp_task(p);
6674
- unsigned long target_capacity = ULONG_MAX;
6675
- unsigned long min_wake_util = ULONG_MAX;
6676
- unsigned long target_max_spare_cap = 0;
6677
- unsigned long target_util = ULONG_MAX;
6678
- /* Initialise with deepest possible cstate (INT_MAX) */
6679
- int shallowest_idle_cstate = INT_MAX;
6680
- struct sched_group *sg;
6681
- int best_active_cpu = -1;
6682
- int best_idle_cpu = -1;
6683
- int target_cpu = -1;
6684
- int backup_cpu = -1;
6685
- bool prefer_idle;
6686
- bool boosted;
6687
- int i;
6688
-
6689
- /*
6690
- * In most cases, target_capacity tracks capacity_orig of the most
6691
- * energy efficient CPU candidate, thus requiring to minimise
6692
- * target_capacity. For these cases target_capacity is already
6693
- * initialized to ULONG_MAX.
6694
- * However, for prefer_idle and boosted tasks we look for a high
6695
- * performance CPU, thus requiring to maximise target_capacity. In this
6696
- * case we initialise target_capacity to 0.
6697
- */
6698
- prefer_idle = uclamp_latency_sensitive(p);
6699
- boosted = uclamp_boosted(p);
6700
- if (prefer_idle && boosted)
6701
- target_capacity = 0;
6702
-
6703
- /* Scan CPUs in all SDs */
6704
- sg = sd->groups;
6705
- do {
6706
- for_each_cpu_and(i, p->cpus_ptr, sched_group_span(sg)) {
6707
- unsigned long capacity_curr = capacity_curr_of(i);
6708
- unsigned long capacity_orig = capacity_orig_of(i);
6709
- unsigned long wake_util, new_util;
6710
- long spare_cap;
6711
- int idle_idx = INT_MAX;
6712
-
6713
- if (!cpu_online(i))
6714
- continue;
6715
-
6716
- /*
6717
- * p's blocked utilization is still accounted for on prev_cpu
6718
- * so prev_cpu will receive a negative bias due to the double
6719
- * accounting. However, the blocked utilization may be zero.
6720
- */
6721
- wake_util = cpu_util_without(i, p);
6722
- new_util = wake_util + task_util_est(p);
6723
-
6724
- /*
6725
- * Ensure minimum capacity to grant the required boost.
6726
- * The target CPU can be already at a capacity level higher
6727
- * than the one required to boost the task.
6728
- */
6729
- new_util = max(min_util, new_util);
6730
- if (new_util > capacity_orig)
6731
- continue;
6732
-
6733
- /*
6734
- * Pre-compute the maximum possible capacity we expect
6735
- * to have available on this CPU once the task is
6736
- * enqueued here.
6737
- */
6738
- spare_cap = capacity_orig - new_util;
6739
-
6740
- if (idle_cpu(i))
6741
- idle_idx = idle_get_state_idx(cpu_rq(i));
6742
-
6743
-
6744
- /*
6745
- * Case A) Latency sensitive tasks
6746
- *
6747
- * Unconditionally favoring tasks that prefer idle CPU to
6748
- * improve latency.
6749
- *
6750
- * Looking for:
6751
- * - an idle CPU, whatever its idle_state is, since
6752
- * the first CPUs we explore are more likely to be
6753
- * reserved for latency sensitive tasks.
6754
- * - a non idle CPU where the task fits in its current
6755
- * capacity and has the maximum spare capacity.
6756
- * - a non idle CPU with lower contention from other
6757
- * tasks and running at the lowest possible OPP.
6758
- *
6759
- * The last two goals tries to favor a non idle CPU
6760
- * where the task can run as if it is "almost alone".
6761
- * A maximum spare capacity CPU is favoured since
6762
- * the task already fits into that CPU's capacity
6763
- * without waiting for an OPP chance.
6764
- *
6765
- * The following code path is the only one in the CPUs
6766
- * exploration loop which is always used by
6767
- * prefer_idle tasks. It exits the loop with wither a
6768
- * best_active_cpu or a target_cpu which should
6769
- * represent an optimal choice for latency sensitive
6770
- * tasks.
6771
- */
6772
- if (prefer_idle) {
6773
-
6774
- /*
6775
- * Case A.1: IDLE CPU
6776
- * Return the best IDLE CPU we find:
6777
- * - for boosted tasks: the CPU with the highest
6778
- * performance (i.e. biggest capacity_orig)
6779
- * - for !boosted tasks: the most energy
6780
- * efficient CPU (i.e. smallest capacity_orig)
6781
- */
6782
- if (idle_cpu(i)) {
6783
- if (boosted &&
6784
- capacity_orig < target_capacity)
6785
- continue;
6786
- if (!boosted &&
6787
- capacity_orig > target_capacity)
6788
- continue;
6789
- /*
6790
- * Minimise value of idle state: skip
6791
- * deeper idle states and pick the
6792
- * shallowest.
6793
- */
6794
- if (capacity_orig == target_capacity &&
6795
- sysctl_sched_cstate_aware &&
6796
- idle_idx >= shallowest_idle_cstate)
6797
- continue;
6798
-
6799
- target_capacity = capacity_orig;
6800
- shallowest_idle_cstate = idle_idx;
6801
- best_idle_cpu = i;
6802
- continue;
6803
- }
6804
- if (best_idle_cpu != -1)
6805
- continue;
6806
-
6807
- /*
6808
- * Case A.2: Target ACTIVE CPU
6809
- * Favor CPUs with max spare capacity.
6810
- */
6811
- if (capacity_curr > new_util &&
6812
- spare_cap > target_max_spare_cap) {
6813
- target_max_spare_cap = spare_cap;
6814
- target_cpu = i;
6815
- continue;
6816
- }
6817
- if (target_cpu != -1)
6818
- continue;
6819
-
6820
-
6821
- /*
6822
- * Case A.3: Backup ACTIVE CPU
6823
- * Favor CPUs with:
6824
- * - lower utilization due to other tasks
6825
- * - lower utilization with the task in
6826
- */
6827
- if (wake_util > min_wake_util)
6828
- continue;
6829
- min_wake_util = wake_util;
6830
- best_active_cpu = i;
6831
- continue;
6832
- }
6833
-
6834
- /*
6835
- * Enforce EAS mode
6836
- *
6837
- * For non latency sensitive tasks, skip CPUs that
6838
- * will be overutilized by moving the task there.
6839
- *
6840
- * The goal here is to remain in EAS mode as long as
6841
- * possible at least for !prefer_idle tasks.
6842
- */
6843
- if ((new_util * capacity_margin) >
6844
- (capacity_orig * SCHED_CAPACITY_SCALE))
6845
- continue;
6846
-
6847
- /*
6848
- * Favor CPUs with smaller capacity for non latency
6849
- * sensitive tasks.
6850
- */
6851
- if (capacity_orig > target_capacity)
6852
- continue;
6853
-
6854
- /*
6855
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6856
- *
6857
- * Find an optimal backup IDLE CPU for non latency
6858
- * sensitive tasks.
6859
- *
6860
- * Looking for:
6861
- * - minimizing the capacity_orig,
6862
- * i.e. preferring LITTLE CPUs
6863
- * - favoring shallowest idle states
6864
- * i.e. avoid to wakeup deep-idle CPUs
6865
- *
6866
- * The following code path is used by non latency
6867
- * sensitive tasks if IDLE CPUs are available. If at
6868
- * least one of such CPUs are available it sets the
6869
- * best_idle_cpu to the most suitable idle CPU to be
6870
- * selected.
6871
- *
6872
- * If idle CPUs are available, favour these CPUs to
6873
- * improve performances by spreading tasks.
6874
- * Indeed, the energy_diff() computed by the caller
6875
- * will take care to ensure the minimization of energy
6876
- * consumptions without affecting performance.
6877
- */
6878
- if (idle_cpu(i)) {
6879
- /*
6880
- * Skip CPUs in deeper idle state, but only
6881
- * if they are also less energy efficient.
6882
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6883
- * shallow idle big CPU.
6884
- */
6885
- if (capacity_orig == target_capacity &&
6886
- sysctl_sched_cstate_aware &&
6887
- idle_idx >= shallowest_idle_cstate)
6888
- continue;
6889
-
6890
- target_capacity = capacity_orig;
6891
- shallowest_idle_cstate = idle_idx;
6892
- best_idle_cpu = i;
6893
- continue;
6894
- }
6895
-
6896
- /*
6897
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6898
- *
6899
- * Pack tasks in the most energy efficient capacities.
6900
- *
6901
- * This task packing strategy prefers more energy
6902
- * efficient CPUs (i.e. pack on smaller maximum
6903
- * capacity CPUs) while also trying to spread tasks to
6904
- * run them all at the lower OPP.
6905
- *
6906
- * This assumes for example that it's more energy
6907
- * efficient to run two tasks on two CPUs at a lower
6908
- * OPP than packing both on a single CPU but running
6909
- * that CPU at an higher OPP.
6910
- *
6911
- * Thus, this case keep track of the CPU with the
6912
- * smallest maximum capacity and highest spare maximum
6913
- * capacity.
6914
- */
6915
-
6916
- /* Favor CPUs with maximum spare capacity */
6917
- if (capacity_orig == target_capacity &&
6918
- spare_cap < target_max_spare_cap)
6919
- continue;
6920
-
6921
- target_max_spare_cap = spare_cap;
6922
- target_capacity = capacity_orig;
6923
- target_util = new_util;
6924
- target_cpu = i;
6925
- }
6926
-
6927
- } while (sg = sg->next, sg != sd->groups);
6928
-
6929
- /*
6930
- * For non latency sensitive tasks, cases B and C in the previous loop,
6931
- * we pick the best IDLE CPU only if we was not able to find a target
6932
- * ACTIVE CPU.
6933
- *
6934
- * Policies priorities:
6935
- *
6936
- * - prefer_idle tasks:
6937
- *
6938
- * a) IDLE CPU available: best_idle_cpu
6939
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6940
- * capacity (i.e. target_cpu)
6941
- * c) ACTIVE CPU with less contention due to other tasks
6942
- * (i.e. best_active_cpu)
6943
- *
6944
- * - NON prefer_idle tasks:
6945
- *
6946
- * a) ACTIVE CPU: target_cpu
6947
- * b) IDLE CPU: best_idle_cpu
6948
- */
6949
-
6950
- if (prefer_idle && (best_idle_cpu != -1)) {
6951
- target_cpu = best_idle_cpu;
6952
- goto target;
6953
- }
6954
-
6955
- if (target_cpu == -1)
6956
- target_cpu = prefer_idle
6957
- ? best_active_cpu
6958
- : best_idle_cpu;
6959
- else
6960
- backup_cpu = prefer_idle
6961
- ? best_active_cpu
6962
- : best_idle_cpu;
6963
-
6964
- if (backup_cpu >= 0)
6965
- cpumask_set_cpu(backup_cpu, cpus);
6966
- if (target_cpu >= 0) {
6967
-target:
6968
- cpumask_set_cpu(target_cpu, cpus);
6969
- }
6970
-
6971
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6972
- best_active_cpu, target_cpu, backup_cpu);
6973
-}
6974
-
6975
-/*
6976
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6977
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6978
- *
6979
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6980
- * BALANCE_WAKE sort things out.
6981
- */
6982
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6983
-{
6984
- long min_cap, max_cap;
6985
-
6986
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6987
- return 0;
6988
-
6989
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6990
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6991
-
6992
- /* Minimum capacity is close to max, no need to abort wake_affine */
6993
- if (max_cap - min_cap < max_cap >> 3)
6994
- return 0;
6995
-
6996
- /* Bring task utilization in sync with prev_cpu */
6997
- sync_entity_load_avg(&p->se);
6998
-
6999
- return !task_fits_capacity(p, min_cap);
70006545 }
70016546
70026547 /*
....@@ -7038,154 +6583,61 @@
70386583 }
70396584
70406585 /*
7041
- * compute_energy(): Estimates the energy that would be consumed if @p was
6586
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70426587 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7043
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6588
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70446589 * to compute what would be the energy if we decided to actually migrate that
70456590 * task.
70466591 */
70476592 static long
70486593 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70496594 {
7050
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7051
- unsigned long sum_util, energy = 0;
7052
- struct task_struct *tsk;
6595
+ struct cpumask *pd_mask = perf_domain_span(pd);
6596
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6597
+ unsigned long max_util = 0, sum_util = 0;
6598
+ unsigned long energy = 0;
70536599 int cpu;
70546600
7055
- for (; pd; pd = pd->next) {
7056
- struct cpumask *pd_mask = perf_domain_span(pd);
6601
+ /*
6602
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6603
+ * of another rd if they belong to the same pd. So, account for the
6604
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6605
+ * instead of the rd span.
6606
+ *
6607
+ * If an entire pd is outside of the current rd, it will not appear in
6608
+ * its pd list and will not be accounted by compute_energy().
6609
+ */
6610
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6611
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6612
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70576613
70586614 /*
7059
- * The energy model mandates all the CPUs of a performance
7060
- * domain have the same capacity.
6615
+ * Busy time computation: utilization clamping is not
6616
+ * required since the ratio (sum_util / cpu_capacity)
6617
+ * is already enough to scale the EM reported power
6618
+ * consumption at the (eventually clamped) cpu_capacity.
70616619 */
7062
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7063
- max_util = sum_util = 0;
6620
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6621
+ ENERGY_UTIL, NULL);
70646622
70656623 /*
7066
- * The capacity state of CPUs of the current rd can be driven by
7067
- * CPUs of another rd if they belong to the same performance
7068
- * domain. So, account for the utilization of these CPUs too
7069
- * by masking pd with cpu_online_mask instead of the rd span.
7070
- *
7071
- * If an entire performance domain is outside of the current rd,
7072
- * it will not appear in its pd list and will not be accounted
7073
- * by compute_energy().
6624
+ * Performance domain frequency: utilization clamping
6625
+ * must be considered since it affects the selection
6626
+ * of the performance domain frequency.
6627
+ * NOTE: in case RT tasks are running, by default the
6628
+ * FREQUENCY_UTIL's utilization can be max OPP.
70746629 */
7075
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7076
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7077
-
7078
- /*
7079
- * Busy time computation: utilization clamping is not
7080
- * required since the ratio (sum_util / cpu_capacity)
7081
- * is already enough to scale the EM reported power
7082
- * consumption at the (eventually clamped) cpu_capacity.
7083
- */
7084
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7085
- ENERGY_UTIL, NULL);
7086
-
7087
- /*
7088
- * Performance domain frequency: utilization clamping
7089
- * must be considered since it affects the selection
7090
- * of the performance domain frequency.
7091
- * NOTE: in case RT tasks are running, by default the
7092
- * FREQUENCY_UTIL's utilization can be max OPP.
7093
- */
7094
- tsk = cpu == dst_cpu ? p : NULL;
7095
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7096
- FREQUENCY_UTIL, tsk);
7097
- max_util = max(max_util, cpu_util);
7098
- }
7099
-
7100
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6630
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6631
+ FREQUENCY_UTIL, tsk);
6632
+ max_util = max(max_util, cpu_util);
71016633 }
6634
+
6635
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6636
+ if (!energy)
6637
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71026638
71036639 return energy;
71046640 }
7105
-
7106
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7107
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7108
-{
7109
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7110
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7111
- bool prefer_idle = uclamp_latency_sensitive(p);
7112
- bool boosted = uclamp_boosted(p);
7113
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7114
- unsigned long highest_spare_cap = 0;
7115
- unsigned int min_exit_lat = UINT_MAX;
7116
- int cpu, max_spare_cap_cpu;
7117
- struct cpuidle_state *idle;
7118
-
7119
- for (; pd; pd = pd->next) {
7120
- max_spare_cap_cpu = -1;
7121
- max_spare_cap = 0;
7122
-
7123
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7124
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
7125
- continue;
7126
-
7127
- util = cpu_util_next(cpu, p, cpu);
7128
- cpu_cap = capacity_of(cpu);
7129
- spare_cap = cpu_cap - util;
7130
-
7131
- /*
7132
- * Skip CPUs that cannot satisfy the capacity request.
7133
- * IOW, placing the task there would make the CPU
7134
- * overutilized. Take uclamp into account to see how
7135
- * much capacity we can get out of the CPU; this is
7136
- * aligned with schedutil_cpu_util().
7137
- */
7138
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7139
- if (cpu_cap * 1024 < util * capacity_margin)
7140
- continue;
7141
-
7142
- /*
7143
- * Find the CPU with the maximum spare capacity in
7144
- * the performance domain
7145
- */
7146
- if (spare_cap > max_spare_cap) {
7147
- max_spare_cap = spare_cap;
7148
- max_spare_cap_cpu = cpu;
7149
- }
7150
-
7151
- if (!prefer_idle)
7152
- continue;
7153
-
7154
- if (idle_cpu(cpu)) {
7155
- cpu_cap = capacity_orig_of(cpu);
7156
- if (boosted && cpu_cap < target_cap)
7157
- continue;
7158
- if (!boosted && cpu_cap > target_cap)
7159
- continue;
7160
- idle = idle_get_state(cpu_rq(cpu));
7161
- if (idle && idle->exit_latency > min_exit_lat &&
7162
- cpu_cap == target_cap)
7163
- continue;
7164
-
7165
- if (idle)
7166
- min_exit_lat = idle->exit_latency;
7167
- target_cap = cpu_cap;
7168
- best_idle_cpu = cpu;
7169
- } else if (spare_cap > highest_spare_cap) {
7170
- highest_spare_cap = spare_cap;
7171
- highest_spare_cap_cpu = cpu;
7172
- }
7173
- }
7174
-
7175
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7176
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7177
- }
7178
-
7179
- if (!prefer_idle)
7180
- return;
7181
-
7182
- if (best_idle_cpu >= 0)
7183
- cpumask_set_cpu(best_idle_cpu, cpus);
7184
- else
7185
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7186
-}
7187
-
7188
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71896641
71906642 /*
71916643 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
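
The reworked compute_energy() above handles a single performance domain (the walk over
pd->next moves to the caller) and derives two aggregates per domain: a busy-time term, the
sum of the CPUs' utilizations, and a frequency term, their maximum. The sketch below mirrors
that aggregation with an invented quadratic cost function standing in for em_cpu_energy();
the utilization numbers and the capacity are made-up examples.

#include <stdio.h>

/* Toy stand-in for the energy model: charge a cost that grows with the
 * OPP implied by max_util, scaled by how busy the domain is overall. */
static unsigned long toy_em_energy(unsigned long max_util,
				   unsigned long sum_util,
				   unsigned long cap)
{
	unsigned long cost = (max_util * max_util) / cap + 1;

	return cost * sum_util / cap;
}

int main(void)
{
	unsigned long util[] = { 100, 250, 400, 50 };	/* per-CPU, invented */
	unsigned long cap = 1024, sum_util = 0, max_util = 0;

	for (int i = 0; i < 4; i++) {
		sum_util += util[i];
		if (util[i] > max_util)
			max_util = util[i];
	}
	printf("sum_util=%lu max_util=%lu energy=%lu\n",
	       sum_util, max_util, toy_em_energy(max_util, sum_util, cap));
	return 0;
}
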
....@@ -7226,27 +6678,39 @@
72266678 * other use-cases too. So, until someone finds a better way to solve this,
72276679 * let's keep things simple by re-using the existing slow path.
72286680 */
7229
-
72306681 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72316682 {
7232
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6683
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6684
+ unsigned long best_delta2 = ULONG_MAX;
72336685 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7234
- int weight, cpu, best_energy_cpu = prev_cpu;
7235
- unsigned long cur_energy;
7236
- struct perf_domain *pd;
6686
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6687
+ unsigned long max_spare_cap_ls = 0, target_cap;
6688
+ unsigned long cpu_cap, util, base_energy = 0;
6689
+ bool boosted, latency_sensitive = false;
6690
+ unsigned int min_exit_lat = UINT_MAX;
6691
+ int cpu, best_energy_cpu = prev_cpu;
6692
+ struct cpuidle_state *idle;
72376693 struct sched_domain *sd;
7238
- cpumask_t *candidates;
6694
+ struct perf_domain *pd;
6695
+ int new_cpu = INT_MAX;
72396696
7240
- if (sysctl_sched_sync_hint_enable && sync) {
7241
- cpu = smp_processor_id();
7242
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
7243
- return cpu;
7244
- }
6697
+ sync_entity_load_avg(&p->se);
6698
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6699
+ if (new_cpu != INT_MAX)
6700
+ return new_cpu;
72456701
72466702 rcu_read_lock();
72476703 pd = rcu_dereference(rd->pd);
72486704 if (!pd || READ_ONCE(rd->overutilized))
72496705 goto fail;
6706
+
6707
+ cpu = smp_processor_id();
6708
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6709
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6710
+ task_fits_capacity(p, capacity_of(cpu))) {
6711
+ rcu_read_unlock();
6712
+ return cpu;
6713
+ }
72506714
72516715 /*
72526716 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7258,59 +6722,149 @@
72586722 if (!sd)
72596723 goto fail;
72606724
7261
- sync_entity_load_avg(&p->se);
72626725 if (!task_util_est(p))
72636726 goto unlock;
72646727
7265
- /* Pre-select a set of candidate CPUs. */
7266
- candidates = this_cpu_ptr(&energy_cpus);
7267
- cpumask_clear(candidates);
6728
+ latency_sensitive = uclamp_latency_sensitive(p);
6729
+ boosted = uclamp_boosted(p);
6730
+ target_cap = boosted ? 0 : ULONG_MAX;
72686731
7269
- if (sched_feat(FIND_BEST_TARGET))
7270
- find_best_target(sd, candidates, p);
7271
- else
7272
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6732
+ for (; pd; pd = pd->next) {
6733
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6734
+ unsigned long base_energy_pd;
6735
+ int max_spare_cap_cpu = -1;
72736736
7274
- /* Bail out if no candidate was found. */
7275
- weight = cpumask_weight(candidates);
7276
- if (!weight)
7277
- goto unlock;
6737
+ /* Compute the 'base' energy of the pd, without @p */
6738
+ base_energy_pd = compute_energy(p, -1, pd);
6739
+ base_energy += base_energy_pd;
72786740
7279
- /* If there is only one sensible candidate, select it now. */
7280
- cpu = cpumask_first(candidates);
7281
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7282
- (cpu == prev_cpu))) {
7283
- best_energy_cpu = cpu;
7284
- goto unlock;
7285
- }
6741
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6742
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6743
+ continue;
72866744
7287
- if (cpumask_test_cpu(prev_cpu, p->cpus_ptr))
7288
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7289
- else
7290
- prev_energy = best_energy = ULONG_MAX;
6745
+ util = cpu_util_next(cpu, p, cpu);
6746
+ cpu_cap = capacity_of(cpu);
6747
+ spare_cap = cpu_cap;
6748
+ lsub_positive(&spare_cap, util);
72916749
7292
- /* Select the best candidate energy-wise. */
7293
- for_each_cpu(cpu, candidates) {
7294
- if (cpu == prev_cpu)
7295
- continue;
7296
- cur_energy = compute_energy(p, cpu, pd);
7297
- if (cur_energy < best_energy) {
7298
- best_energy = cur_energy;
7299
- best_energy_cpu = cpu;
6750
+ /*
6751
+ * Skip CPUs that cannot satisfy the capacity request.
6752
+ * IOW, placing the task there would make the CPU
6753
+ * overutilized. Take uclamp into account to see how
6754
+ * much capacity we can get out of the CPU; this is
6755
+ * aligned with schedutil_cpu_util().
6756
+ */
6757
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6758
+ if (!fits_capacity(util, cpu_cap))
6759
+ continue;
6760
+
6761
+ /* Always use prev_cpu as a candidate. */
6762
+ if (!latency_sensitive && cpu == prev_cpu) {
6763
+ prev_delta = compute_energy(p, prev_cpu, pd);
6764
+ prev_delta -= base_energy_pd;
6765
+ best_delta = min(best_delta, prev_delta);
6766
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6767
+ if (prev_delta == best_delta)
6768
+ best_energy_cpu = prev_cpu;
6769
+ }
6770
+ }
6771
+
6772
+ /*
6773
+ * Find the CPU with the maximum spare capacity in
6774
+ * the performance domain
6775
+ */
6776
+ if (spare_cap > max_spare_cap) {
6777
+ max_spare_cap = spare_cap;
6778
+ max_spare_cap_cpu = cpu;
6779
+ }
6780
+
6781
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6782
+ if (!latency_sensitive)
6783
+ continue;
6784
+ }
6785
+
6786
+ if (idle_cpu(cpu)) {
6787
+ cpu_cap = capacity_orig_of(cpu);
6788
+ if (boosted && cpu_cap < target_cap)
6789
+ continue;
6790
+ if (!boosted && cpu_cap > target_cap)
6791
+ continue;
6792
+ idle = idle_get_state(cpu_rq(cpu));
6793
+ if (idle && idle->exit_latency > min_exit_lat &&
6794
+ cpu_cap == target_cap)
6795
+ continue;
6796
+
6797
+ if (idle)
6798
+ min_exit_lat = idle->exit_latency;
6799
+ target_cap = cpu_cap;
6800
+ best_idle_cpu = cpu;
6801
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6802
+ best_delta2 = compute_energy(p, cpu, pd);
6803
+ best_delta2 -= base_energy_pd;
6804
+ }
6805
+ } else if (spare_cap > max_spare_cap_ls) {
6806
+ max_spare_cap_ls = spare_cap;
6807
+ max_spare_cap_cpu_ls = cpu;
6808
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6809
+ if (best_idle_cpu == -1) {
6810
+ best_delta2 = compute_energy(p, cpu, pd);
6811
+ best_delta2 -= base_energy_pd;
6812
+ }
6813
+ }
6814
+ }
6815
+ }
6816
+
6817
+ /* Evaluate the energy impact of using this CPU. */
6818
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6819
+ max_spare_cap_cpu != prev_cpu) {
6820
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6821
+ cur_delta -= base_energy_pd;
6822
+ if (cur_delta < best_delta) {
6823
+ best_delta = cur_delta;
6824
+ best_energy_cpu = max_spare_cap_cpu;
6825
+ }
73006826 }
73016827 }
73026828 unlock:
73036829 rcu_read_unlock();
73046830
6831
+ if (latency_sensitive)
6832
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6833
+
73056834 /*
73066835 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73076836 * least 6% of the energy used by prev_cpu.
73086837 */
7309
- if (prev_energy == ULONG_MAX)
6838
+ if (prev_delta == ULONG_MAX)
73106839 return best_energy_cpu;
73116840
7312
- if ((prev_energy - best_energy) > (prev_energy >> 4))
6841
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73136842 return best_energy_cpu;
6843
+
6844
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6845
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6846
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6847
+ int level = rockchip_perf_get_level();
6848
+
6849
+ /*
6850
+ * When ROCKCHIP_PERFORMANCE_LOW is selected:
6851
+ * Pick best_energy_cpu if prev_cpu is a big CPU and best_energy_cpu
6852
+ * is a little CPU, so that tasks can migrate from the big CPUs to the
6853
+ * little CPUs more easily and save power.
6854
+ */
6855
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
6856
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
6857
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
6858
+ return best_energy_cpu;
6859
+ }
6860
+
6861
+ /*
6862
+ * Pick the idlest CPU if the extra power cost is small (< 3.1%).
6863
+ */
6864
+ if ((best_delta2 <= prev_delta) ||
6865
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
6866
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6867
+ }
73146868
73156869 return prev_cpu;
73166870
....@@ -7333,39 +6887,44 @@
73336887 * preempt must be disabled.
73346888 */
73356889 static int
7336
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7337
- int sibling_count_hint)
6890
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73386891 {
73396892 struct sched_domain *tmp, *sd = NULL;
73406893 int cpu = smp_processor_id();
73416894 int new_cpu = prev_cpu;
73426895 int want_affine = 0;
73436896 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6897
+ int target_cpu = -1;
6898
+
6899
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
6900
+ !(sd_flag & SD_BALANCE_FORK))
6901
+ sync_entity_load_avg(&p->se);
6902
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
6903
+ wake_flags, &target_cpu);
6904
+ if (target_cpu >= 0)
6905
+ return target_cpu;
73446906
73456907 if (sd_flag & SD_BALANCE_WAKE) {
73466908 record_wakee(p);
73476909
7348
- if (static_branch_unlikely(&sched_energy_present)) {
7349
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7350
- goto sd_loop;
6910
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6911
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6912
+ goto no_eas;
6913
+ }
73516914
6915
+ if (sched_energy_enabled()) {
73526916 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73536917 if (new_cpu >= 0)
73546918 return new_cpu;
73556919 new_cpu = prev_cpu;
73566920 }
73576921
7358
- want_affine = !wake_wide(p, sibling_count_hint) &&
7359
- !wake_cap(p, cpu, prev_cpu) &&
7360
- cpumask_test_cpu(cpu, p->cpus_ptr);
6922
+no_eas:
6923
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73616924 }
73626925
7363
-sd_loop:
73646926 rcu_read_lock();
73656927 for_each_domain(cpu, tmp) {
7366
- if (!(tmp->flags & SD_LOAD_BALANCE))
7367
- break;
7368
-
73696928 /*
73706929 * If both 'cpu' and 'prev_cpu' are part of this domain,
73716930 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7392,6 +6951,23 @@
73926951 /* Fast path */
73936952
73946953 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6954
+
6955
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6956
+ struct root_domain *rd = cpu_rq(cpu)->rd;
6957
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6958
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6959
+ int level = rockchip_perf_get_level();
6960
+
6961
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
6962
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
6963
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
6964
+ for_each_domain(cpu, tmp) {
6965
+ sd = tmp;
6966
+ }
6967
+ if (sd)
6968
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6969
+ }
6970
+ }
73956971
73966972 if (want_affine)
73976973 current->recent_used_cpu = cpu;
....@@ -7469,6 +7045,15 @@
74697045 {
74707046 remove_entity_load_avg(&p->se);
74717047 }
7048
+
7049
+static int
7050
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7051
+{
7052
+ if (rq->nr_running)
7053
+ return 1;
7054
+
7055
+ return newidle_balance(rq, rf) != 0;
7056
+}
74727057 #endif /* CONFIG_SMP */
74737058
74747059 static unsigned long wakeup_gran(struct sched_entity *se)
....@@ -7522,7 +7107,7 @@
75227107
75237108 static void set_last_buddy(struct sched_entity *se)
75247109 {
7525
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7110
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75267111 return;
75277112
75287113 for_each_sched_entity(se) {
....@@ -7534,7 +7119,7 @@
75347119
75357120 static void set_next_buddy(struct sched_entity *se)
75367121 {
7537
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7122
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75387123 return;
75397124
75407125 for_each_sched_entity(se) {
....@@ -7560,6 +7145,7 @@
75607145 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75617146 int scale = cfs_rq->nr_running >= sched_nr_latency;
75627147 int next_buddy_marked = 0;
7148
+ bool preempt = false, nopreempt = false;
75637149
75647150 if (unlikely(se == pse))
75657151 return;
....@@ -7592,8 +7178,8 @@
75927178 return;
75937179
75947180 /* Idle tasks are by definition preempted by non-idle tasks. */
7595
- if (unlikely(curr->policy == SCHED_IDLE) &&
7596
- likely(p->policy != SCHED_IDLE))
7181
+ if (unlikely(task_has_idle_policy(curr)) &&
7182
+ likely(!task_has_idle_policy(p)))
75977183 goto preempt;
75987184
75997185 /*
....@@ -7605,6 +7191,12 @@
76057191
76067192 find_matching_se(&se, &pse);
76077193 update_curr(cfs_rq_of(se));
7194
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7195
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7196
+ if (preempt)
7197
+ goto preempt;
7198
+ if (nopreempt)
7199
+ return;
76087200 BUG_ON(!pse);
76097201 if (wakeup_preempt_entity(se, pse) == 1) {
76107202 /*
....@@ -7636,20 +7228,21 @@
76367228 set_last_buddy(se);
76377229 }
76387230
7639
-static struct task_struct *
7231
+struct task_struct *
76407232 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76417233 {
76427234 struct cfs_rq *cfs_rq = &rq->cfs;
7643
- struct sched_entity *se;
7644
- struct task_struct *p;
7235
+ struct sched_entity *se = NULL;
7236
+ struct task_struct *p = NULL;
76457237 int new_tasks;
7238
+ bool repick = false;
76467239
76477240 again:
7648
- if (!cfs_rq->nr_running)
7241
+ if (!sched_fair_runnable(rq))
76497242 goto idle;
76507243
76517244 #ifdef CONFIG_FAIR_GROUP_SCHED
7652
- if (prev->sched_class != &fair_sched_class)
7245
+ if (!prev || prev->sched_class != &fair_sched_class)
76537246 goto simple;
76547247
76557248 /*
....@@ -7696,7 +7289,7 @@
76967289 } while (cfs_rq);
76977290
76987291 p = task_of(se);
7699
-
7292
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
77007293 /*
77017294 * Since we haven't yet done put_prev_entity and if the selected task
77027295 * is a different task than we started out with, try and touch the
....@@ -7726,8 +7319,15 @@
77267319 goto done;
77277320 simple:
77287321 #endif
7322
+ if (prev)
7323
+ put_prev_task(rq, prev);
77297324
7730
- put_prev_task(rq, prev);
7325
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7326
+ if (repick) {
7327
+ for_each_sched_entity(se)
7328
+ set_next_entity(cfs_rq_of(se), se);
7329
+ goto done;
7330
+ }
77317331
77327332 do {
77337333 se = pick_next_entity(cfs_rq, NULL);
....@@ -7755,11 +7355,13 @@
77557355 return p;
77567356
77577357 idle:
7758
- update_misfit_status(NULL, rq);
7759
- new_tasks = idle_balance(rq, rf);
7358
+ if (!rf)
7359
+ return NULL;
7360
+
7361
+ new_tasks = newidle_balance(rq, rf);
77607362
77617363 /*
7762
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7364
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77637365 * possible for any higher priority task to appear. In that case we
77647366 * must re-start the pick_next_entity() loop.
77657367 */
....@@ -7776,6 +7378,11 @@
77767378 update_idle_rq_clock_pelt(rq);
77777379
77787380 return NULL;
7381
+}
7382
+
7383
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7384
+{
7385
+ return pick_next_task_fair(rq, NULL, NULL);
77797386 }
77807387
77817388 /*
....@@ -7828,7 +7435,7 @@
78287435 set_skip_buddy(se);
78297436 }
78307437
7831
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7438
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78327439 {
78337440 struct sched_entity *se = &p->se;
78347441
....@@ -7963,15 +7570,54 @@
79637570 * rewrite all of this once again.]
79647571 */
79657572
7966
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7573
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7574
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79677575
79687576 enum fbq_type { regular, remote, all };
79697577
7578
+/*
7579
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7580
+ *
7581
+ * The enum is ordered by pulling priority, with the group with lowest priority
7582
+ * first so the group_type can simply be compared when selecting the busiest
7583
+ * group. See update_sd_pick_busiest().
7584
+ */
79707585 enum group_type {
7971
- group_other = 0,
7586
+ /* The group has spare capacity that can be used to run more tasks. */
7587
+ group_has_spare = 0,
7588
+ /*
7589
+ * The group is fully used and the tasks don't compete for more CPU
7590
+ * cycles. Nevertheless, some tasks might wait before running.
7591
+ */
7592
+ group_fully_busy,
7593
+ /*
7594
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7595
+ * and must be migrated to a more powerful CPU.
7596
+ */
79727597 group_misfit_task,
7598
+ /*
7599
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7600
+ * and the task should be migrated to it instead of running on the
7601
+ * current CPU.
7602
+ */
7603
+ group_asym_packing,
7604
+ /*
7605
+ * The tasks' affinity constraints previously prevented the scheduler
7606
+ * from balancing the load across the system.
7607
+ */
79737608 group_imbalanced,
7974
- group_overloaded,
7609
+ /*
7610
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7611
+ * tasks.
7612
+ */
7613
+ group_overloaded
7614
+};
7615
+
7616
+enum migration_type {
7617
+ migrate_load = 0,
7618
+ migrate_util,
7619
+ migrate_task,
7620
+ migrate_misfit
79757621 };
79767622
79777623 #define LBF_ALL_PINNED 0x01
....@@ -7994,7 +7640,6 @@
79947640 int new_dst_cpu;
79957641 enum cpu_idle_type idle;
79967642 long imbalance;
7997
- unsigned int src_grp_nr_running;
79987643 /* The set of CPUs under consideration for load-balancing */
79997644 struct cpumask *cpus;
80007645
....@@ -8005,8 +7650,9 @@
80057650 unsigned int loop_max;
80067651
80077652 enum fbq_type fbq_type;
8008
- enum group_type src_grp_type;
7653
+ enum migration_type migration_type;
80097654 struct list_head tasks;
7655
+ struct rq_flags *src_rq_rf;
80107656 };
80117657
80127658 /*
....@@ -8021,7 +7667,11 @@
80217667 if (p->sched_class != &fair_sched_class)
80227668 return 0;
80237669
8024
- if (unlikely(p->policy == SCHED_IDLE))
7670
+ if (unlikely(task_has_idle_policy(p)))
7671
+ return 0;
7672
+
7673
+ /* SMT siblings share cache */
7674
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80257675 return 0;
80267676
80277677 /*
....@@ -8109,8 +7759,13 @@
81097759 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81107760 {
81117761 int tsk_cache_hot;
7762
+ int can_migrate = 1;
81127763
81137764 lockdep_assert_held(&env->src_rq->lock);
7765
+
7766
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7767
+ if (!can_migrate)
7768
+ return 0;
81147769
81157770 /*
81167771 * We do not migrate tasks that are:
....@@ -8120,6 +7775,10 @@
81207775 * 4) are cache-hot on their current CPU.
81217776 */
81227777 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7778
+ return 0;
7779
+
7780
+ /* Disregard pcpu kthreads; they are where they need to be. */
7781
+ if (kthread_is_per_cpu(p))
81237782 return 0;
81247783
81257784 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
....@@ -8188,9 +7847,20 @@
81887847 */
81897848 static void detach_task(struct task_struct *p, struct lb_env *env)
81907849 {
7850
+ int detached = 0;
7851
+
81917852 lockdep_assert_held(&env->src_rq->lock);
81927853
8193
- p->on_rq = TASK_ON_RQ_MIGRATING;
7854
+ /*
7855
+ * The vendor hook may drop the lock temporarily, so
7856
+ * pass the rq flags to unpin lock. We expect the
7857
+ * rq lock to be held after return.
7858
+ */
7859
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
7860
+ env->dst_cpu, &detached);
7861
+ if (detached)
7862
+ return;
7863
+
81947864 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81957865 set_task_cpu(p, env->dst_cpu);
81967866 }
....@@ -8229,7 +7899,7 @@
82297899 static const unsigned int sched_nr_migrate_break = 32;
82307900
82317901 /*
8232
- * detach_tasks() -- tries to detach up to imbalance weighted load from
7902
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82337903 * busiest_rq, as part of a balancing operation within domain "sd".
82347904 *
82357905 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8237,8 +7907,8 @@
82377907 static int detach_tasks(struct lb_env *env)
82387908 {
82397909 struct list_head *tasks = &env->src_rq->cfs_tasks;
7910
+ unsigned long util, load;
82407911 struct task_struct *p;
8241
- unsigned long load;
82427912 int detached = 0;
82437913
82447914 lockdep_assert_held(&env->src_rq->lock);
....@@ -8268,39 +7938,64 @@
82687938 break;
82697939 }
82707940
8271
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8272
- if (sysctl_sched_performance_bias) {
8273
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8274
- goto next;
8275
- }
8276
-#endif
8277
-
82787941 if (!can_migrate_task(p, env))
82797942 goto next;
82807943
8281
- /*
8282
- * Depending of the number of CPUs and tasks and the
8283
- * cgroup hierarchy, task_h_load() can return a null
8284
- * value. Make sure that env->imbalance decreases
8285
- * otherwise detach_tasks() will stop only after
8286
- * detaching up to loop_max tasks.
8287
- */
8288
- load = max_t(unsigned long, task_h_load(p), 1);
7944
+ switch (env->migration_type) {
7945
+ case migrate_load:
7946
+ /*
7947
+ * Depending of the number of CPUs and tasks and the
7948
+ * cgroup hierarchy, task_h_load() can return a null
7949
+ * value. Make sure that env->imbalance decreases
7950
+ * otherwise detach_tasks() will stop only after
7951
+ * detaching up to loop_max tasks.
7952
+ */
7953
+ load = max_t(unsigned long, task_h_load(p), 1);
82897954
7955
+ if (sched_feat(LB_MIN) &&
7956
+ load < 16 && !env->sd->nr_balance_failed)
7957
+ goto next;
82907958
8291
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8292
- goto next;
7959
+ /*
7960
+ * Make sure that we don't migrate too much load.
7961
+ * Nevertheless, let relax the constraint if
7962
+ * scheduler fails to find a good waiting task to
7963
+ * migrate.
7964
+ */
7965
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7966
+ goto next;
82937967
8294
- if ((load / 2) > env->imbalance)
8295
- goto next;
7968
+ env->imbalance -= load;
7969
+ break;
7970
+
7971
+ case migrate_util:
7972
+ util = task_util_est(p);
7973
+
7974
+ if (util > env->imbalance)
7975
+ goto next;
7976
+
7977
+ env->imbalance -= util;
7978
+ break;
7979
+
7980
+ case migrate_task:
7981
+ env->imbalance--;
7982
+ break;
7983
+
7984
+ case migrate_misfit:
7985
+ /* This is not a misfit task */
7986
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7987
+ goto next;
7988
+
7989
+ env->imbalance = 0;
7990
+ break;
7991
+ }
82967992
82977993 detach_task(p, env);
82987994 list_add(&p->se.group_node, &env->tasks);
82997995
83007996 detached++;
8301
- env->imbalance -= load;
83027997
8303
-#ifdef CONFIG_PREEMPT
7998
+#ifdef CONFIG_PREEMPTION
83047999 /*
83058000 * NEWIDLE balancing is a source of latency, so preemptible
83068001 * kernels will stop after the first task is detached to minimize
....@@ -8312,7 +8007,7 @@
83128007
83138008 /*
83148009 * We only want to steal up to the prescribed amount of
8315
- * weighted load.
8010
+ * load/util/tasks.
83168011 */
83178012 if (env->imbalance <= 0)
83188013 break;
....@@ -8341,7 +8036,6 @@
83418036
83428037 BUG_ON(task_rq(p) != rq);
83438038 activate_task(rq, p, ENQUEUE_NOCLOCK);
8344
- p->on_rq = TASK_ON_RQ_QUEUED;
83458039 check_preempt_curr(rq, p, 0);
83468040 }
83478041
....@@ -8382,6 +8076,7 @@
83828076 rq_unlock(env->dst_rq, &rf);
83838077 }
83848078
8079
+#ifdef CONFIG_NO_HZ_COMMON
83858080 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83868081 {
83878082 if (cfs_rq->avg.load_avg)
....@@ -8401,12 +8096,54 @@
84018096 if (READ_ONCE(rq->avg_dl.util_avg))
84028097 return true;
84038098
8099
+ if (thermal_load_avg(rq))
8100
+ return true;
8101
+
84048102 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84058103 if (READ_ONCE(rq->avg_irq.util_avg))
84068104 return true;
84078105 #endif
84088106
84098107 return false;
8108
+}
8109
+
8110
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8111
+{
8112
+ rq->last_blocked_load_update_tick = jiffies;
8113
+
8114
+ if (!has_blocked)
8115
+ rq->has_blocked_load = 0;
8116
+}
8117
+#else
8118
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8119
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8120
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8121
+#endif
8122
+
8123
+static bool __update_blocked_others(struct rq *rq, bool *done)
8124
+{
8125
+ const struct sched_class *curr_class;
8126
+ u64 now = rq_clock_pelt(rq);
8127
+ unsigned long thermal_pressure;
8128
+ bool decayed;
8129
+
8130
+ /*
8131
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8132
+ * DL and IRQ signals have been updated before updating CFS.
8133
+ */
8134
+ curr_class = rq->curr->sched_class;
8135
+
8136
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8137
+
8138
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8139
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8140
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8141
+ update_irq_load_avg(rq, 0);
8142
+
8143
+ if (others_have_blocked(rq))
8144
+ *done = false;
8145
+
8146
+ return decayed;
84108147 }
84118148
84128149 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8422,22 +8159,17 @@
84228159 if (cfs_rq->avg.util_sum)
84238160 return false;
84248161
8425
- if (cfs_rq->avg.runnable_load_sum)
8162
+ if (cfs_rq->avg.runnable_sum)
84268163 return false;
84278164
84288165 return true;
84298166 }
84308167
8431
-static void update_blocked_averages(int cpu)
8168
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84328169 {
8433
- struct rq *rq = cpu_rq(cpu);
84348170 struct cfs_rq *cfs_rq, *pos;
8435
- const struct sched_class *curr_class;
8436
- struct rq_flags rf;
8437
- bool done = true;
8438
-
8439
- rq_lock_irqsave(rq, &rf);
8440
- update_rq_clock(rq);
8171
+ bool decayed = false;
8172
+ int cpu = cpu_of(rq);
84418173
84428174 /*
84438175 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8446,8 +8178,12 @@
84468178 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84478179 struct sched_entity *se;
84488180
8449
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8450
- update_tg_load_avg(cfs_rq, 0);
8181
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8182
+ update_tg_load_avg(cfs_rq);
8183
+
8184
+ if (cfs_rq == &rq->cfs)
8185
+ decayed = true;
8186
+ }
84518187
84528188 /* Propagate pending load changes to the parent, if any: */
84538189 se = cfs_rq->tg->se[cpu];
....@@ -8463,23 +8199,10 @@
84638199
84648200 /* Don't need periodic decay once load/util_avg are null */
84658201 if (cfs_rq_has_blocked(cfs_rq))
8466
- done = false;
8202
+ *done = false;
84678203 }
84688204
8469
- curr_class = rq->curr->sched_class;
8470
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8471
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8472
- update_irq_load_avg(rq, 0);
8473
- /* Don't need periodic decay once load/util_avg are null */
8474
- if (others_have_blocked(rq))
8475
- done = false;
8476
-
8477
-#ifdef CONFIG_NO_HZ_COMMON
8478
- rq->last_blocked_load_update_tick = jiffies;
8479
- if (done)
8480
- rq->has_blocked_load = 0;
8481
-#endif
8482
- rq_unlock_irqrestore(rq, &rf);
8205
+ return decayed;
84838206 }
84848207
84858208 /*
....@@ -8529,27 +8252,16 @@
85298252 cfs_rq_load_avg(cfs_rq) + 1);
85308253 }
85318254 #else
8532
-static inline void update_blocked_averages(int cpu)
8255
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85338256 {
8534
- struct rq *rq = cpu_rq(cpu);
85358257 struct cfs_rq *cfs_rq = &rq->cfs;
8536
- const struct sched_class *curr_class;
8537
- struct rq_flags rf;
8258
+ bool decayed;
85388259
8539
- rq_lock_irqsave(rq, &rf);
8540
- update_rq_clock(rq);
8541
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8260
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8261
+ if (cfs_rq_has_blocked(cfs_rq))
8262
+ *done = false;
85428263
8543
- curr_class = rq->curr->sched_class;
8544
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8545
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8546
- update_irq_load_avg(rq, 0);
8547
-#ifdef CONFIG_NO_HZ_COMMON
8548
- rq->last_blocked_load_update_tick = jiffies;
8549
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8550
- rq->has_blocked_load = 0;
8551
-#endif
8552
- rq_unlock_irqrestore(rq, &rf);
8264
+ return decayed;
85538265 }
85548266
85558267 static unsigned long task_h_load(struct task_struct *p)
....@@ -8557,6 +8269,24 @@
85578269 return p->se.avg.load_avg;
85588270 }
85598271 #endif
8272
+
8273
+static void update_blocked_averages(int cpu)
8274
+{
8275
+ bool decayed = false, done = true;
8276
+ struct rq *rq = cpu_rq(cpu);
8277
+ struct rq_flags rf;
8278
+
8279
+ rq_lock_irqsave(rq, &rf);
8280
+ update_rq_clock(rq);
8281
+
8282
+ decayed |= __update_blocked_others(rq, &done);
8283
+ decayed |= __update_blocked_fair(rq, &done);
8284
+
8285
+ update_blocked_load_status(rq, !done);
8286
+ if (decayed)
8287
+ cpufreq_update_util(rq, 0);
8288
+ rq_unlock_irqrestore(rq, &rf);
8289
+}
85608290
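
update_blocked_averages() now just combines two helpers: __update_blocked_others() folds the RT, DL, IRQ and thermal PELT signals, __update_blocked_fair() walks the cfs_rq hierarchy, and their return values are OR-ed so cpufreq is poked at most once per call. A rough sketch of how the decayed/done flags interact; everything here is a stand-in, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the per-class PELT updates; each reports whether anything decayed. */
static bool others_decayed, fair_decayed;
static bool others_blocked, fair_blocked;

static bool update_blocked_others_sketch(bool *done)
{
	if (others_blocked)
		*done = false;          /* RT/DL/IRQ/thermal load still pending */
	return others_decayed;
}

static bool update_blocked_fair_sketch(bool *done)
{
	if (fair_blocked)
		*done = false;          /* some cfs_rq still carries blocked load */
	return fair_decayed;
}

static void update_blocked_averages_sketch(void)
{
	bool decayed = false, done = true;

	decayed |= update_blocked_others_sketch(&done);
	decayed |= update_blocked_fair_sketch(&done);

	/* corresponds to update_blocked_load_status(rq, !done) */
	printf("has_blocked_load=%d\n", !done);
	if (decayed)
		printf("cpufreq_update_util() would run once, after both passes\n");
}

int main(void)
{
	fair_decayed = true;
	fair_blocked = true;
	update_blocked_averages_sketch();
	return 0;
}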
85618291 /********** Helpers for find_busiest_group ************************/
85628292
....@@ -8566,15 +8296,15 @@
85668296 struct sg_lb_stats {
85678297 unsigned long avg_load; /*Avg load across the CPUs of the group */
85688298 unsigned long group_load; /* Total load over the CPUs of the group */
8569
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8570
- unsigned long load_per_task;
85718299 unsigned long group_capacity;
8572
- unsigned long group_util; /* Total utilization of the group */
8573
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8300
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8301
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8302
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8303
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85748304 unsigned int idle_cpus;
85758305 unsigned int group_weight;
85768306 enum group_type group_type;
8577
- int group_no_capacity;
8307
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85788308 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85798309 #ifdef CONFIG_NUMA_BALANCING
85808310 unsigned int nr_numa_running;
....@@ -8589,10 +8319,10 @@
85898319 struct sd_lb_stats {
85908320 struct sched_group *busiest; /* Busiest group in this sd */
85918321 struct sched_group *local; /* Local group in this sd */
8592
- unsigned long total_running;
85938322 unsigned long total_load; /* Total load of all groups in sd */
85948323 unsigned long total_capacity; /* Total capacity of all groups in sd */
85958324 unsigned long avg_load; /* Average load across all groups in sd */
8325
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85968326
85978327 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85988328 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8603,54 +8333,26 @@
86038333 /*
86048334 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86058335 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8606
- * We must however clear busiest_stat::avg_load because
8607
- * update_sd_pick_busiest() reads this before assignment.
8336
+ * We must however set busiest_stat::group_type and
8337
+ * busiest_stat::idle_cpus to the worst busiest group because
8338
+ * update_sd_pick_busiest() reads these before assignment.
86088339 */
86098340 *sds = (struct sd_lb_stats){
86108341 .busiest = NULL,
86118342 .local = NULL,
8612
- .total_running = 0UL,
86138343 .total_load = 0UL,
86148344 .total_capacity = 0UL,
86158345 .busiest_stat = {
8616
- .avg_load = 0UL,
8617
- .sum_nr_running = 0,
8618
- .group_type = group_other,
8346
+ .idle_cpus = UINT_MAX,
8347
+ .group_type = group_has_spare,
86198348 },
86208349 };
86218350 }
86228351
8623
-/**
8624
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8625
- * @sd: The sched_domain whose load_idx is to be obtained.
8626
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8627
- *
8628
- * Return: The load index.
8629
- */
8630
-static inline int get_sd_load_idx(struct sched_domain *sd,
8631
- enum cpu_idle_type idle)
8632
-{
8633
- int load_idx;
8634
-
8635
- switch (idle) {
8636
- case CPU_NOT_IDLE:
8637
- load_idx = sd->busy_idx;
8638
- break;
8639
-
8640
- case CPU_NEWLY_IDLE:
8641
- load_idx = sd->newidle_idx;
8642
- break;
8643
- default:
8644
- load_idx = sd->idle_idx;
8645
- break;
8646
- }
8647
-
8648
- return load_idx;
8649
-}
8650
-
8651
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8352
+static unsigned long scale_rt_capacity(int cpu)
86528353 {
86538354 struct rq *rq = cpu_rq(cpu);
8355
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86548356 unsigned long used, free;
86558357 unsigned long irq;
86568358
....@@ -8659,8 +8361,15 @@
86598361 if (unlikely(irq >= max))
86608362 return 1;
86618363
8364
+ /*
8365
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8366
+ * (running and not running) with weights 0 and 1024 respectively.
8367
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8368
+ * average uses the actual delta max capacity(load).
8369
+ */
86628370 used = READ_ONCE(rq->avg_rt.util_avg);
86638371 used += READ_ONCE(rq->avg_dl.util_avg);
8372
+ used += thermal_load_avg(rq);
86648373
86658374 if (unlikely(used >= max))
86668375 return 1;
....@@ -8670,52 +8379,20 @@
86708379 return scale_irq_capacity(free, irq, max);
86718380 }
86728381
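
scale_rt_capacity() now derives the per-CPU maximum from arch_scale_cpu_capacity() itself and subtracts thermal pressure alongside the RT and DL averages before the IRQ scaling. A sketch of the arithmetic; the (max - irq) / max step is an assumption about scale_irq_capacity(), which is not part of this hunk:

#include <stdio.h>

/* Capacity left for CFS after RT, DL, thermal and IRQ pressure (sketch). */
static unsigned long cfs_capacity(unsigned long max, unsigned long irq,
				  unsigned long rt, unsigned long dl,
				  unsigned long thermal)
{
	unsigned long used = rt + dl + thermal;
	unsigned long free;

	if (irq >= max || used >= max)
		return 1;

	free = max - used;
	return free * (max - irq) / max;    /* assumed scale_irq_capacity() behaviour */
}

int main(void)
{
	/* 1024-capacity CPU, 100 units of RT, 50 DL, 74 thermal, 24 IRQ -> ~781 */
	printf("%lu\n", cfs_capacity(1024, 24, 100, 50, 74));
	return 0;
}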
8673
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8674
- raw_spin_lock_init(&mcc->lock);
8675
- mcc->val = 0;
8676
- mcc->cpu = -1;
8677
-}
8678
-
86798382 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86808383 {
8681
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8384
+ unsigned long capacity = scale_rt_capacity(cpu);
86828385 struct sched_group *sdg = sd->groups;
8683
- struct max_cpu_capacity *mcc;
8684
- unsigned long max_capacity;
8685
- int max_cap_cpu;
8686
- unsigned long flags;
86878386
8688
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8689
-
8690
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8691
- capacity >>= SCHED_CAPACITY_SHIFT;
8692
-
8693
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8694
-
8695
- raw_spin_lock_irqsave(&mcc->lock, flags);
8696
- max_capacity = mcc->val;
8697
- max_cap_cpu = mcc->cpu;
8698
-
8699
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8700
- (max_capacity < capacity)) {
8701
- mcc->val = capacity;
8702
- mcc->cpu = cpu;
8703
-#ifdef CONFIG_SCHED_DEBUG
8704
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8705
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8706
- // cpu, capacity);
8707
- goto skip_unlock;
8708
-#endif
8709
- }
8710
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8711
-
8712
-skip_unlock: __attribute__ ((unused));
8713
- capacity = scale_rt_capacity(cpu, capacity);
8387
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87148388
87158389 if (!capacity)
87168390 capacity = 1;
87178391
8392
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87188393 cpu_rq(cpu)->cpu_capacity = capacity;
8394
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8395
+
87198396 sdg->sgc->capacity = capacity;
87208397 sdg->sgc->min_capacity = capacity;
87218398 sdg->sgc->max_capacity = capacity;
....@@ -8748,29 +8425,11 @@
87488425 */
87498426
87508427 for_each_cpu(cpu, sched_group_span(sdg)) {
8751
- struct sched_group_capacity *sgc;
8752
- struct rq *rq = cpu_rq(cpu);
8428
+ unsigned long cpu_cap = capacity_of(cpu);
87538429
8754
- /*
8755
- * build_sched_domains() -> init_sched_groups_capacity()
8756
- * gets here before we've attached the domains to the
8757
- * runqueues.
8758
- *
8759
- * Use capacity_of(), which is set irrespective of domains
8760
- * in update_cpu_capacity().
8761
- *
8762
- * This avoids capacity from being 0 and
8763
- * causing divide-by-zero issues on boot.
8764
- */
8765
- if (unlikely(!rq->sd)) {
8766
- capacity += capacity_of(cpu);
8767
- } else {
8768
- sgc = rq->sd->groups->sgc;
8769
- capacity += sgc->capacity;
8770
- }
8771
-
8772
- min_capacity = min(capacity, min_capacity);
8773
- max_capacity = max(capacity, max_capacity);
8430
+ capacity += cpu_cap;
8431
+ min_capacity = min(cpu_cap, min_capacity);
8432
+ max_capacity = max(cpu_cap, max_capacity);
87748433 }
87758434 } else {
87768435 /*
....@@ -8804,6 +8463,18 @@
88048463 {
88058464 return ((rq->cpu_capacity * sd->imbalance_pct) <
88068465 (rq->cpu_capacity_orig * 100));
8466
+}
8467
+
8468
+/*
8469
+ * Check whether a rq has a misfit task and if it looks like we can actually
8470
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8471
+ * the task's current CPU is heavily pressured.
8472
+ */
8473
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8474
+{
8475
+ return rq->misfit_task_load &&
8476
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8477
+ check_cpu_capacity(rq, sd));
88078478 }
88088479
88098480 /*
....@@ -8853,13 +8524,17 @@
88538524 * any benefit for the load balance.
88548525 */
88558526 static inline bool
8856
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8527
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88578528 {
88588529 if (sgs->sum_nr_running < sgs->group_weight)
88598530 return true;
88608531
8532
+ if ((sgs->group_capacity * imbalance_pct) <
8533
+ (sgs->group_runnable * 100))
8534
+ return false;
8535
+
88618536 if ((sgs->group_capacity * 100) >
8862
- (sgs->group_util * env->sd->imbalance_pct))
8537
+ (sgs->group_util * imbalance_pct))
88638538 return true;
88648539
88658540 return false;
....@@ -8874,13 +8549,17 @@
88748549 * false.
88758550 */
88768551 static inline bool
8877
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8552
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88788553 {
88798554 if (sgs->sum_nr_running <= sgs->group_weight)
88808555 return false;
88818556
88828557 if ((sgs->group_capacity * 100) <
8883
- (sgs->group_util * env->sd->imbalance_pct))
8558
+ (sgs->group_util * imbalance_pct))
8559
+ return true;
8560
+
8561
+ if ((sgs->group_capacity * imbalance_pct) <
8562
+ (sgs->group_runnable * 100))
88848563 return true;
88858564
88868565 return false;
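
Both predicates now weigh group_runnable as well as group_util against group_capacity, using the domain's imbalance_pct as the margin. With group_capacity = 1024 and imbalance_pct = 117, for example, a group_util above about 875 (1024 * 100 / 117) already defeats the capacity check. A trimmed-down sketch of the two tests; the struct is a stand-in for sg_lb_stats:

#include <stdbool.h>

struct sg_stats {
	unsigned long group_capacity;
	unsigned long group_util;
	unsigned long group_runnable;
	unsigned int  sum_nr_running;
	unsigned int  group_weight;
};

static bool group_has_capacity_sketch(unsigned int imbalance_pct, const struct sg_stats *sgs)
{
	if (sgs->sum_nr_running < sgs->group_weight)
		return true;                            /* fewer tasks than CPUs */

	if (sgs->group_capacity * imbalance_pct < sgs->group_runnable * 100)
		return false;                           /* runnable pressure too high */

	if (sgs->group_capacity * 100 > sgs->group_util * imbalance_pct)
		return true;                            /* utilization leaves headroom */

	return false;
}

static bool group_is_overloaded_sketch(unsigned int imbalance_pct, const struct sg_stats *sgs)
{
	if (sgs->sum_nr_running <= sgs->group_weight)
		return false;

	if (sgs->group_capacity * 100 < sgs->group_util * imbalance_pct)
		return true;

	if (sgs->group_capacity * imbalance_pct < sgs->group_runnable * 100)
		return true;

	return false;
}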
....@@ -8893,8 +8572,7 @@
88938572 static inline bool
88948573 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88958574 {
8896
- return sg->sgc->min_capacity * capacity_margin <
8897
- ref->sgc->min_capacity * 1024;
8575
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88988576 }
88998577
89008578 /*
....@@ -8904,24 +8582,30 @@
89048582 static inline bool
89058583 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89068584 {
8907
- return sg->sgc->max_capacity * capacity_margin <
8908
- ref->sgc->max_capacity * 1024;
8585
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89098586 }
89108587
89118588 static inline enum
8912
-group_type group_classify(struct sched_group *group,
8589
+group_type group_classify(unsigned int imbalance_pct,
8590
+ struct sched_group *group,
89138591 struct sg_lb_stats *sgs)
89148592 {
8915
- if (sgs->group_no_capacity)
8593
+ if (group_is_overloaded(imbalance_pct, sgs))
89168594 return group_overloaded;
89178595
89188596 if (sg_imbalanced(group))
89198597 return group_imbalanced;
89208598
8599
+ if (sgs->group_asym_packing)
8600
+ return group_asym_packing;
8601
+
89218602 if (sgs->group_misfit_task_load)
89228603 return group_misfit_task;
89238604
8924
- return group_other;
8605
+ if (!group_has_capacity(imbalance_pct, sgs))
8606
+ return group_fully_busy;
8607
+
8608
+ return group_has_spare;
89258609 }
89268610
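
group_classify() resolves the types in strict precedence: overloaded, then imbalanced, then asym_packing, then misfit, then fully_busy, and has_spare as the default. The enum is ordered so that a numerically larger value means a busier group, which is what init_sd_lb_stats() and update_sd_pick_busiest() rely on when they compare group_type directly. A condensed sketch of that ordering; the enum names and the flag struct are stand-ins:

#include <stdbool.h>

/* Ordered so that a numerically larger value means a busier group. */
enum group_type_sketch {
	GROUP_HAS_SPARE,
	GROUP_FULLY_BUSY,
	GROUP_MISFIT_TASK,
	GROUP_ASYM_PACKING,
	GROUP_IMBALANCED,
	GROUP_OVERLOADED,
};

struct group_flags {
	bool overloaded, imbalanced, asym_packing, misfit, has_capacity;
};

static enum group_type_sketch classify_sketch(const struct group_flags *g)
{
	if (g->overloaded)
		return GROUP_OVERLOADED;
	if (g->imbalanced)
		return GROUP_IMBALANCED;
	if (g->asym_packing)
		return GROUP_ASYM_PACKING;
	if (g->misfit)
		return GROUP_MISFIT_TASK;
	if (!g->has_capacity)
		return GROUP_FULLY_BUSY;
	return GROUP_HAS_SPARE;
}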
89278611 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8958,12 +8642,11 @@
89588642 struct sg_lb_stats *sgs,
89598643 int *sg_status)
89608644 {
8961
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8962
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8963
- unsigned long load;
8964
- int i, nr_running;
8645
+ int i, nr_running, local_group;
89658646
89668647 memset(sgs, 0, sizeof(*sgs));
8648
+
8649
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89678650
89688651 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89698652 struct rq *rq = cpu_rq(i);
....@@ -8971,17 +8654,14 @@
89718654 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89728655 env->flags |= LBF_NOHZ_AGAIN;
89738656
8974
- /* Bias balancing toward CPUs of our domain: */
8975
- if (local_group)
8976
- load = target_load(i, load_idx);
8977
- else
8978
- load = source_load(i, load_idx);
8979
-
8980
- sgs->group_load += load;
8657
+ sgs->group_load += cpu_load(rq);
89818658 sgs->group_util += cpu_util(i);
8982
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8659
+ sgs->group_runnable += cpu_runnable(rq);
8660
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89838661
89848662 nr_running = rq->nr_running;
8663
+ sgs->sum_nr_running += nr_running;
8664
+
89858665 if (nr_running > 1)
89868666 *sg_status |= SG_OVERLOAD;
89878667
....@@ -8992,13 +8672,19 @@
89928672 sgs->nr_numa_running += rq->nr_numa_running;
89938673 sgs->nr_preferred_running += rq->nr_preferred_running;
89948674 #endif
8995
- sgs->sum_weighted_load += weighted_cpuload(rq);
89968675 /*
89978676 * No need to call idle_cpu() if nr_running is not 0
89988677 */
8999
- if (!nr_running && idle_cpu(i))
8678
+ if (!nr_running && idle_cpu(i)) {
90008679 sgs->idle_cpus++;
8680
+ /* Idle cpu can't have misfit task */
8681
+ continue;
8682
+ }
90018683
8684
+ if (local_group)
8685
+ continue;
8686
+
8687
+ /* Check for a misfit task on the cpu */
90028688 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90038689 sgs->group_misfit_task_load < rq->misfit_task_load) {
90048690 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9006,17 +8692,24 @@
90068692 }
90078693 }
90088694
9009
- /* Adjust by relative CPU capacity of the group */
9010
- sgs->group_capacity = group->sgc->capacity;
9011
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8695
+ /* Check if dst CPU is idle and preferred to this group */
8696
+ if (env->sd->flags & SD_ASYM_PACKING &&
8697
+ env->idle != CPU_NOT_IDLE &&
8698
+ sgs->sum_h_nr_running &&
8699
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8700
+ sgs->group_asym_packing = 1;
8701
+ }
90128702
9013
- if (sgs->sum_nr_running)
9014
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8703
+ sgs->group_capacity = group->sgc->capacity;
90158704
90168705 sgs->group_weight = group->group_weight;
90178706
9018
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9019
- sgs->group_type = group_classify(group, sgs);
8707
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8708
+
8709
+ /* Computing avg_load makes sense only when group is overloaded */
8710
+ if (sgs->group_type == group_overloaded)
8711
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8712
+ sgs->group_capacity;
90208713 }
90218714
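
update_sg_lb_stats() now sums cpu_load(), cpu_util() and cpu_runnable() per CPU, counts both total (sum_nr_running) and CFS-only (sum_h_nr_running) tasks, and only computes avg_load when the group ends up classified as overloaded. A compact sketch of the accumulation loop, driven by fabricated per-CPU samples (the overload test below is a crude stand-in for group_is_overloaded()):

#include <stdio.h>

#define NR_CPUS_SKETCH 4
#define SCHED_CAPACITY_SCALE 1024UL

struct sg_stats_sketch {
	unsigned long group_load, group_util, group_runnable, avg_load;
	unsigned int  sum_nr_running, sum_h_nr_running, idle_cpus;
	unsigned long group_capacity;
	int overloaded;
};

int main(void)
{
	/* Fabricated per-CPU samples */
	unsigned long load[NR_CPUS_SKETCH]     = { 600, 0, 300, 900 };
	unsigned long util[NR_CPUS_SKETCH]     = { 500, 0, 250, 800 };
	unsigned long runnable[NR_CPUS_SKETCH] = { 650, 0, 300, 950 };
	unsigned int  nr[NR_CPUS_SKETCH]       = { 2, 0, 1, 3 };
	unsigned int  cfs_nr[NR_CPUS_SKETCH]   = { 2, 0, 1, 2 };

	struct sg_stats_sketch sgs = { .group_capacity = 4 * SCHED_CAPACITY_SCALE };

	for (int i = 0; i < NR_CPUS_SKETCH; i++) {
		sgs.group_load       += load[i];
		sgs.group_util       += util[i];
		sgs.group_runnable   += runnable[i];
		sgs.sum_nr_running   += nr[i];
		sgs.sum_h_nr_running += cfs_nr[i];
		if (!nr[i])
			sgs.idle_cpus++;    /* an idle CPU cannot host a misfit task */
	}

	sgs.overloaded = sgs.sum_nr_running > NR_CPUS_SKETCH;  /* crude stand-in */
	if (sgs.overloaded)                 /* avg_load only when overloaded */
		sgs.avg_load = sgs.group_load * SCHED_CAPACITY_SCALE / sgs.group_capacity;

	printf("load=%lu util=%lu idle=%u avg_load=%lu\n",
	       sgs.group_load, sgs.group_util, sgs.idle_cpus, sgs.avg_load);
	return 0;
}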
90228715 /**
....@@ -9039,6 +8732,10 @@
90398732 {
90408733 struct sg_lb_stats *busiest = &sds->busiest_stat;
90418734
8735
+ /* Make sure that there is at least one task to pull */
8736
+ if (!sgs->sum_h_nr_running)
8737
+ return false;
8738
+
90428739 /*
90438740 * Don't try to pull misfit tasks we can't help.
90448741 * We can use max_capacity here as reduction in capacity on some
....@@ -9047,7 +8744,7 @@
90478744 */
90488745 if (sgs->group_type == group_misfit_task &&
90498746 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9050
- !group_has_capacity(env, &sds->local_stat)))
8747
+ sds->local_stat.group_type != group_has_spare))
90518748 return false;
90528749
90538750 if (sgs->group_type > busiest->group_type)
....@@ -9056,62 +8753,92 @@
90568753 if (sgs->group_type < busiest->group_type)
90578754 return false;
90588755
9059
- if (sgs->avg_load <= busiest->avg_load)
8756
+ /*
8757
+ * The candidate and the current busiest group are the same type of
8758
+ * group. Let's check which one is the busiest according to the type.
8759
+ */
8760
+
8761
+ switch (sgs->group_type) {
8762
+ case group_overloaded:
8763
+ /* Select the overloaded group with highest avg_load. */
8764
+ if (sgs->avg_load <= busiest->avg_load)
8765
+ return false;
8766
+ break;
8767
+
8768
+ case group_imbalanced:
8769
+ /*
8770
+ * Select the 1st imbalanced group as we don't have any way to
8771
+ * choose one more than another.
8772
+ */
90608773 return false;
90618774
9062
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9063
- goto asym_packing;
9064
-
9065
- /*
9066
- * Candidate sg has no more than one task per CPU and
9067
- * has higher per-CPU capacity. Migrating tasks to less
9068
- * capable CPUs may harm throughput. Maximize throughput,
9069
- * power/energy consequences are not considered.
9070
- */
9071
- if (sgs->sum_nr_running <= sgs->group_weight &&
9072
- group_smaller_min_cpu_capacity(sds->local, sg))
9073
- return false;
9074
-
9075
- /*
9076
- * If we have more than one misfit sg go with the biggest misfit.
9077
- */
9078
- if (sgs->group_type == group_misfit_task &&
9079
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9080
- return false;
9081
-
9082
-asym_packing:
9083
- /* This is the busiest node in its class. */
9084
- if (!(env->sd->flags & SD_ASYM_PACKING))
9085
- return true;
9086
-
9087
- /* No ASYM_PACKING if target CPU is already busy */
9088
- if (env->idle == CPU_NOT_IDLE)
9089
- return true;
9090
- /*
9091
- * ASYM_PACKING needs to move all the work to the highest
9092
- * prority CPUs in the group, therefore mark all groups
9093
- * of lower priority than ourself as busy.
9094
- */
9095
- if (sgs->sum_nr_running &&
9096
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9097
- if (!sds->busiest)
9098
- return true;
9099
-
8775
+ case group_asym_packing:
91008776 /* Prefer to move from lowest priority CPU's work */
9101
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9102
- sg->asym_prefer_cpu))
9103
- return true;
8777
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8778
+ return false;
8779
+ break;
8780
+
8781
+ case group_misfit_task:
8782
+ /*
8783
+ * If we have more than one misfit sg go with the biggest
8784
+ * misfit.
8785
+ */
8786
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8787
+ return false;
8788
+ break;
8789
+
8790
+ case group_fully_busy:
8791
+ /*
8792
+ * Select the fully busy group with highest avg_load. In
8793
+ * theory, there is no need to pull task from such kind of
8794
+ * group because tasks have all compute capacity that they need
8795
+ * but we can still improve the overall throughput by reducing
8796
+ * contention when accessing shared HW resources.
8797
+ *
8798
+ * XXX for now avg_load is not computed and always 0 so we
8799
+ * select the 1st one.
8800
+ */
8801
+ if (sgs->avg_load <= busiest->avg_load)
8802
+ return false;
8803
+ break;
8804
+
8805
+ case group_has_spare:
8806
+ /*
8807
+ * Select not overloaded group with lowest number of idle cpus
8808
+ * and highest number of running tasks. We could also compare
8809
+ * the spare capacity which is more stable but it can end up
8810
+ * that the group has less spare capacity but finally more idle
8811
+ * CPUs which means less opportunity to pull tasks.
8812
+ */
8813
+ if (sgs->idle_cpus > busiest->idle_cpus)
8814
+ return false;
8815
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8816
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
8817
+ return false;
8818
+
8819
+ break;
91048820 }
91058821
9106
- return false;
8822
+ /*
8823
+ * Candidate sg has no more than one task per CPU and has higher
8824
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8825
+ * throughput. Maximize throughput, power/energy consequences are not
8826
+ * considered.
8827
+ */
8828
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8829
+ (sgs->group_type <= group_fully_busy) &&
8830
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
8831
+ return false;
8832
+
8833
+ return true;
91078834 }
91088835
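
For two group_has_spare candidates the switch above prefers the group with fewer idle CPUs and, on a tie, the one with more running tasks, since that is where pulling work is most likely to pay off. That comparison in isolation, with the struct trimmed down as an assumption:

#include <stdbool.h>

struct spare_stats {
	unsigned int idle_cpus;
	unsigned int sum_nr_running;
};

/*
 * Mirror of the group_has_spare leg: fewer idle CPUs wins, ties are broken
 * by the larger number of running tasks.
 */
static bool candidate_is_busier(const struct spare_stats *sgs,
				const struct spare_stats *busiest)
{
	if (sgs->idle_cpus > busiest->idle_cpus)
		return false;
	if (sgs->idle_cpus == busiest->idle_cpus &&
	    sgs->sum_nr_running <= busiest->sum_nr_running)
		return false;
	return true;
}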
91098836 #ifdef CONFIG_NUMA_BALANCING
91108837 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91118838 {
9112
- if (sgs->sum_nr_running > sgs->nr_numa_running)
8839
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91138840 return regular;
9114
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
8841
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91158842 return remote;
91168843 return all;
91178844 }
....@@ -9136,18 +8863,334 @@
91368863 }
91378864 #endif /* CONFIG_NUMA_BALANCING */
91388865
8866
+
8867
+struct sg_lb_stats;
8868
+
8869
+/*
8870
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
8871
+ */
8872
+
8873
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8874
+{
8875
+ /* Task has no contribution or is new */
8876
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8877
+ return 0;
8878
+
8879
+ if (task_on_rq_queued(p))
8880
+ return 1;
8881
+
8882
+ return 0;
8883
+}
8884
+
8885
+/**
8886
+ * idle_cpu_without - would a given CPU be idle without p ?
8887
+ * @cpu: the processor on which idleness is tested.
8888
+ * @p: task which should be ignored.
8889
+ *
8890
+ * Return: 1 if the CPU would be idle. 0 otherwise.
8891
+ */
8892
+static int idle_cpu_without(int cpu, struct task_struct *p)
8893
+{
8894
+ struct rq *rq = cpu_rq(cpu);
8895
+
8896
+ if (rq->curr != rq->idle && rq->curr != p)
8897
+ return 0;
8898
+
8899
+ /*
8900
+ * rq->nr_running can't be used but an updated version without the
8901
+ * impact of p on cpu must be used instead. The updated nr_running
8902
+ * must be computed and tested before calling idle_cpu_without().
8903
+ */
8904
+
8905
+#ifdef CONFIG_SMP
8906
+ if (rq->ttwu_pending)
8907
+ return 0;
8908
+#endif
8909
+
8910
+ return 1;
8911
+}
8912
+
8913
+/*
8914
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
8915
+ * @sd: The sched_domain level to look for idlest group.
8916
+ * @group: sched_group whose statistics are to be updated.
8917
+ * @sgs: variable to hold the statistics for this group.
8918
+ * @p: The task for which we look for the idlest group/CPU.
8919
+ */
8920
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8921
+ struct sched_group *group,
8922
+ struct sg_lb_stats *sgs,
8923
+ struct task_struct *p)
8924
+{
8925
+ int i, nr_running;
8926
+
8927
+ memset(sgs, 0, sizeof(*sgs));
8928
+
8929
+ for_each_cpu(i, sched_group_span(group)) {
8930
+ struct rq *rq = cpu_rq(i);
8931
+ unsigned int local;
8932
+
8933
+ sgs->group_load += cpu_load_without(rq, p);
8934
+ sgs->group_util += cpu_util_without(i, p);
8935
+ sgs->group_runnable += cpu_runnable_without(rq, p);
8936
+ local = task_running_on_cpu(i, p);
8937
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8938
+
8939
+ nr_running = rq->nr_running - local;
8940
+ sgs->sum_nr_running += nr_running;
8941
+
8942
+ /*
8943
+ * No need to call idle_cpu_without() if nr_running is not 0
8944
+ */
8945
+ if (!nr_running && idle_cpu_without(i, p))
8946
+ sgs->idle_cpus++;
8947
+
8948
+ }
8949
+
8950
+ /* Check if task fits in the group */
8951
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
8952
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
8953
+ sgs->group_misfit_task_load = 1;
8954
+ }
8955
+
8956
+ sgs->group_capacity = group->sgc->capacity;
8957
+
8958
+ sgs->group_weight = group->group_weight;
8959
+
8960
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8961
+
8962
+ /*
8963
+ * Computing avg_load makes sense only when group is fully busy or
8964
+ * overloaded
8965
+ */
8966
+ if (sgs->group_type == group_fully_busy ||
8967
+ sgs->group_type == group_overloaded)
8968
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8969
+ sgs->group_capacity;
8970
+}
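
update_sg_wakeup_stats() computes the same statistics as the load-balance path, but with @p's own contribution removed: the *_without() helpers subtract p's PELT signal, and task_running_on_cpu() removes p from the run counts before idle_cpu_without() is consulted. A fabricated two-CPU walk-through of that accounting (all values and the struct are assumptions):

#include <stdio.h>

struct cpu_sample {
	unsigned long util_without_p;   /* cpu_util_without(cpu, p) */
	unsigned int  nr_running, cfs_nr_running;
	int p_is_here;                  /* task_running_on_cpu(cpu, p) */
};

int main(void)
{
	struct cpu_sample cpus[2] = {
		{ 250, 2, 2, 1 },       /* p is currently queued here */
		{   0, 0, 0, 0 },       /* genuinely idle CPU */
	};
	unsigned int sum_nr = 0, sum_h_nr = 0, idle_cpus = 0;
	unsigned long group_util = 0;

	for (int i = 0; i < 2; i++) {
		unsigned int nr = cpus[i].nr_running - cpus[i].p_is_here;

		sum_nr     += nr;
		sum_h_nr   += cpus[i].cfs_nr_running - cpus[i].p_is_here;
		group_util += cpus[i].util_without_p;
		if (!nr)                /* idle_cpu_without() would also be checked */
			idle_cpus++;
	}
	printf("nr=%u h_nr=%u util=%lu idle=%u\n",
	       sum_nr, sum_h_nr, group_util, idle_cpus);
	return 0;
}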
8971
+
8972
+static bool update_pick_idlest(struct sched_group *idlest,
8973
+ struct sg_lb_stats *idlest_sgs,
8974
+ struct sched_group *group,
8975
+ struct sg_lb_stats *sgs)
8976
+{
8977
+ if (sgs->group_type < idlest_sgs->group_type)
8978
+ return true;
8979
+
8980
+ if (sgs->group_type > idlest_sgs->group_type)
8981
+ return false;
8982
+
8983
+ /*
8984
+ * The candidate and the current idlest group are the same type of
8985
+ * group. Let's check which one is the idlest according to the type.
8986
+ */
8987
+
8988
+ switch (sgs->group_type) {
8989
+ case group_overloaded:
8990
+ case group_fully_busy:
8991
+ /* Select the group with lowest avg_load. */
8992
+ if (idlest_sgs->avg_load <= sgs->avg_load)
8993
+ return false;
8994
+ break;
8995
+
8996
+ case group_imbalanced:
8997
+ case group_asym_packing:
8998
+ /* Those types are not used in the slow wakeup path */
8999
+ return false;
9000
+
9001
+ case group_misfit_task:
9002
+ /* Select group with the highest max capacity */
9003
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9004
+ return false;
9005
+ break;
9006
+
9007
+ case group_has_spare:
9008
+ /* Select group with most idle CPUs */
9009
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9010
+ return false;
9011
+
9012
+ /* Select group with lowest group_util */
9013
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9014
+ idlest_sgs->group_util <= sgs->group_util)
9015
+ return false;
9016
+
9017
+ break;
9018
+ }
9019
+
9020
+ return true;
9021
+}
9022
+
9023
+/*
9024
+ * find_idlest_group() finds and returns the least busy CPU group within the
9025
+ * domain.
9026
+ *
9027
+ * Assumes p is allowed on at least one CPU in sd.
9028
+ */
9029
+static struct sched_group *
9030
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9031
+{
9032
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9033
+ struct sg_lb_stats local_sgs, tmp_sgs;
9034
+ struct sg_lb_stats *sgs;
9035
+ unsigned long imbalance;
9036
+ struct sg_lb_stats idlest_sgs = {
9037
+ .avg_load = UINT_MAX,
9038
+ .group_type = group_overloaded,
9039
+ };
9040
+
9041
+ imbalance = scale_load_down(NICE_0_LOAD) *
9042
+ (sd->imbalance_pct-100) / 100;
9043
+
9044
+ do {
9045
+ int local_group;
9046
+
9047
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9048
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9049
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9050
+ int level = rockchip_perf_get_level();
9051
+
9052
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9053
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9054
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9055
+ continue;
9056
+ }
9057
+
9058
+ /* Skip over this group if it has no CPUs allowed */
9059
+ if (!cpumask_intersects(sched_group_span(group),
9060
+ p->cpus_ptr))
9061
+ continue;
9062
+
9063
+ local_group = cpumask_test_cpu(this_cpu,
9064
+ sched_group_span(group));
9065
+
9066
+ if (local_group) {
9067
+ sgs = &local_sgs;
9068
+ local = group;
9069
+ } else {
9070
+ sgs = &tmp_sgs;
9071
+ }
9072
+
9073
+ update_sg_wakeup_stats(sd, group, sgs, p);
9074
+
9075
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9076
+ idlest = group;
9077
+ idlest_sgs = *sgs;
9078
+ }
9079
+
9080
+ } while (group = group->next, group != sd->groups);
9081
+
9082
+
9083
+ /* There is no idlest group to push tasks to */
9084
+ if (!idlest)
9085
+ return NULL;
9086
+
9087
+ /* The local group has been skipped because of CPU affinity */
9088
+ if (!local)
9089
+ return idlest;
9090
+
9091
+ /*
9092
+ * If the local group is idler than the selected idlest group
9093
+ * don't try and push the task.
9094
+ */
9095
+ if (local_sgs.group_type < idlest_sgs.group_type)
9096
+ return NULL;
9097
+
9098
+ /*
9099
+ * If the local group is busier than the selected idlest group
9100
+ * try and push the task.
9101
+ */
9102
+ if (local_sgs.group_type > idlest_sgs.group_type)
9103
+ return idlest;
9104
+
9105
+ switch (local_sgs.group_type) {
9106
+ case group_overloaded:
9107
+ case group_fully_busy:
9108
+ /*
9109
+ * When comparing groups across NUMA domains, it's possible for
9110
+ * the local domain to be very lightly loaded relative to the
9111
+ * remote domains but "imbalance" skews the comparison making
9112
+ * remote CPUs look much more favourable. When considering
9113
+ * cross-domain, add imbalance to the load on the remote node
9114
+ * and consider staying local.
9115
+ */
9116
+
9117
+ if ((sd->flags & SD_NUMA) &&
9118
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9119
+ return NULL;
9120
+
9121
+ /*
9122
+ * If the local group is less loaded than the selected
9123
+ * idlest group don't try and push any tasks.
9124
+ */
9125
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9126
+ return NULL;
9127
+
9128
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9129
+ return NULL;
9130
+ break;
9131
+
9132
+ case group_imbalanced:
9133
+ case group_asym_packing:
9134
+ /* Those types are not used in the slow wakeup path */
9135
+ return NULL;
9136
+
9137
+ case group_misfit_task:
9138
+ /* Select group with the highest max capacity */
9139
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9140
+ return NULL;
9141
+ break;
9142
+
9143
+ case group_has_spare:
9144
+ if (sd->flags & SD_NUMA) {
9145
+#ifdef CONFIG_NUMA_BALANCING
9146
+ int idlest_cpu;
9147
+ /*
9148
+ * If there is spare capacity at NUMA, try to select
9149
+ * the preferred node
9150
+ */
9151
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9152
+ return NULL;
9153
+
9154
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9155
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9156
+ return idlest;
9157
+#endif
9158
+ /*
9159
+ * Otherwise, keep the task on this node to stay close
9160
+ * to its wakeup source and improve locality. If there is
9161
+ * a real need of migration, periodic load balance will
9162
+ * take care of it.
9163
+ */
9164
+ if (local_sgs.idle_cpus)
9165
+ return NULL;
9166
+ }
9167
+
9168
+ /*
9169
+ * Select group with highest number of idle CPUs. We could also
9170
+ * compare the utilization which is more stable but it can end
9171
+ * up that the group has less spare capacity but finally more
9172
+ * idle CPUs which means more opportunity to run task.
9173
+ */
9174
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9175
+ return NULL;
9176
+ break;
9177
+ }
9178
+
9179
+ return idlest;
9180
+}
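
For overloaded or fully busy groups, the slow wakeup path above biases toward staying local: the candidate has to beat the local group by more than imbalance = scale_load_down(NICE_0_LOAD) * (imbalance_pct - 100) / 100 and by more than the imbalance_pct ratio. With imbalance_pct = 117 that bias is 1024 * 17 / 100 = 174. A sketch of those three checks; NICE_0_LOAD_SKETCH and the flattened argument list are assumptions:

#include <stdio.h>

#define NICE_0_LOAD_SKETCH 1024UL

/* Returns 1 only when the idlest group is clearly lighter than the local one. */
static int push_to_idlest(unsigned long local_avg, unsigned long idlest_avg,
			  unsigned int imbalance_pct, int numa)
{
	unsigned long imbalance = NICE_0_LOAD_SKETCH * (imbalance_pct - 100) / 100;

	if (numa && idlest_avg + imbalance >= local_avg)
		return 0;                       /* remote node not clearly better */
	if (idlest_avg >= local_avg + imbalance)
		return 0;                       /* idlest not lighter enough */
	if (100 * local_avg <= imbalance_pct * idlest_avg)
		return 0;                       /* within the imbalance_pct margin */
	return 1;
}

int main(void)
{
	/* imbalance_pct=117 -> bias of 174; local=900 vs idlest=600 still pushes */
	printf("%d\n", push_to_idlest(900, 600, 117, 0));
	return 0;
}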
9181
+
91399182 /**
91409183 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91419184 * @env: The load balancing environment.
91429185 * @sds: variable to hold the statistics for this sched_domain.
91439186 */
9187
+
91449188 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91459189 {
91469190 struct sched_domain *child = env->sd->child;
91479191 struct sched_group *sg = env->sd->groups;
91489192 struct sg_lb_stats *local = &sds->local_stat;
91499193 struct sg_lb_stats tmp_sgs;
9150
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91519194 int sg_status = 0;
91529195
91539196 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9174,22 +9217,6 @@
91749217 if (local_group)
91759218 goto next_group;
91769219
9177
- /*
9178
- * In case the child domain prefers tasks go to siblings
9179
- * first, lower the sg capacity so that we'll try
9180
- * and move all the excess tasks away. We lower the capacity
9181
- * of a group only if the local group has the capacity to fit
9182
- * these excess tasks. The extra check prevents the case where
9183
- * you always pull from the heaviest group when it is already
9184
- * under-utilized (possible with a large weight task outweighs
9185
- * the tasks on the system).
9186
- */
9187
- if (prefer_sibling && sds->local &&
9188
- group_has_capacity(env, local) &&
9189
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9190
- sgs->group_no_capacity = 1;
9191
- sgs->group_type = group_classify(sg, sgs);
9192
- }
91939220
91949221 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91959222 sds->busiest = sg;
....@@ -9198,12 +9225,14 @@
91989225
91999226 next_group:
92009227 /* Now, start updating sd_lb_stats */
9201
- sds->total_running += sgs->sum_nr_running;
92029228 sds->total_load += sgs->group_load;
92039229 sds->total_capacity += sgs->group_capacity;
92049230
92059231 sg = sg->next;
92069232 } while (sg != env->sd->groups);
9233
+
9234
+ /* Tag domain that child domain prefers tasks go to siblings first */
9235
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92079236
92089237 #ifdef CONFIG_NO_HZ_COMMON
92099238 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9217,8 +9246,6 @@
92179246 if (env->sd->flags & SD_NUMA)
92189247 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92199248
9220
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9221
-
92229249 if (!env->sd->parent) {
92239250 struct root_domain *rd = env->dst_rq->rd;
92249251
....@@ -9227,144 +9254,28 @@
92279254
92289255 /* Update over-utilization (tipping point, U >= 0) indicator */
92299256 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9230
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9257
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92319258 } else if (sg_status & SG_OVERUTILIZED) {
9232
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9233
- trace_sched_overutilized(1);
9234
- }
9259
+ struct root_domain *rd = env->dst_rq->rd;
92359260
9261
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9262
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9263
+ }
92369264 }
92379265
9238
-/**
9239
- * check_asym_packing - Check to see if the group is packed into the
9240
- * sched domain.
9241
- *
9242
- * This is primarily intended to used at the sibling level. Some
9243
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9244
- * case of POWER7, it can move to lower SMT modes only when higher
9245
- * threads are idle. When in lower SMT modes, the threads will
9246
- * perform better since they share less core resources. Hence when we
9247
- * have idle threads, we want them to be the higher ones.
9248
- *
9249
- * This packing function is run on idle threads. It checks to see if
9250
- * the busiest CPU in this domain (core in the P7 case) has a higher
9251
- * CPU number than the packing function is being run on. Here we are
9252
- * assuming lower CPU number will be equivalent to lower a SMT thread
9253
- * number.
9254
- *
9255
- * Return: 1 when packing is required and a task should be moved to
9256
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9257
- *
9258
- * @env: The load balancing environment.
9259
- * @sds: Statistics of the sched_domain which is to be packed
9260
- */
9261
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9266
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92629267 {
9263
- int busiest_cpu;
9264
-
9265
- if (!(env->sd->flags & SD_ASYM_PACKING))
9266
- return 0;
9267
-
9268
- if (env->idle == CPU_NOT_IDLE)
9269
- return 0;
9270
-
9271
- if (!sds->busiest)
9272
- return 0;
9273
-
9274
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9275
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9276
- return 0;
9277
-
9278
- env->imbalance = DIV_ROUND_CLOSEST(
9279
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9280
- SCHED_CAPACITY_SCALE);
9281
-
9282
- return 1;
9283
-}
9284
-
9285
-/**
9286
- * fix_small_imbalance - Calculate the minor imbalance that exists
9287
- * amongst the groups of a sched_domain, during
9288
- * load balancing.
9289
- * @env: The load balancing environment.
9290
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9291
- */
9292
-static inline
9293
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9294
-{
9295
- unsigned long tmp, capa_now = 0, capa_move = 0;
9296
- unsigned int imbn = 2;
9297
- unsigned long scaled_busy_load_per_task;
9298
- struct sg_lb_stats *local, *busiest;
9299
-
9300
- local = &sds->local_stat;
9301
- busiest = &sds->busiest_stat;
9302
-
9303
- if (!local->sum_nr_running)
9304
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9305
- else if (busiest->load_per_task > local->load_per_task)
9306
- imbn = 1;
9307
-
9308
- scaled_busy_load_per_task =
9309
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9310
- busiest->group_capacity;
9311
-
9312
- if (busiest->avg_load + scaled_busy_load_per_task >=
9313
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9314
- env->imbalance = busiest->load_per_task;
9315
- return;
9316
- }
9268
+ unsigned int imbalance_min;
93179269
93189270 /*
9319
- * OK, we don't have enough imbalance to justify moving tasks,
9320
- * however we may be able to increase total CPU capacity used by
9321
- * moving them.
9271
+ * Allow a small imbalance based on a simple pair of communicating
9272
+ * tasks that remain local when the source domain is almost idle.
93229273 */
9274
+ imbalance_min = 2;
9275
+ if (nr_running <= imbalance_min)
9276
+ return 0;
93239277
9324
- capa_now += busiest->group_capacity *
9325
- min(busiest->load_per_task, busiest->avg_load);
9326
- capa_now += local->group_capacity *
9327
- min(local->load_per_task, local->avg_load);
9328
- capa_now /= SCHED_CAPACITY_SCALE;
9329
-
9330
- /* Amount of load we'd subtract */
9331
- if (busiest->avg_load > scaled_busy_load_per_task) {
9332
- capa_move += busiest->group_capacity *
9333
- min(busiest->load_per_task,
9334
- busiest->avg_load - scaled_busy_load_per_task);
9335
- }
9336
-
9337
- /* Amount of load we'd add */
9338
- if (busiest->avg_load * busiest->group_capacity <
9339
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9340
- tmp = (busiest->avg_load * busiest->group_capacity) /
9341
- local->group_capacity;
9342
- } else {
9343
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9344
- local->group_capacity;
9345
- }
9346
- capa_move += local->group_capacity *
9347
- min(local->load_per_task, local->avg_load + tmp);
9348
- capa_move /= SCHED_CAPACITY_SCALE;
9349
-
9350
- /* Move if we gain throughput */
9351
- if (capa_move > capa_now) {
9352
- env->imbalance = busiest->load_per_task;
9353
- return;
9354
- }
9355
-
9356
- /* We can't see throughput improvement with the load-based
9357
- * method, but it is possible depending upon group size and
9358
- * capacity range that there might still be an underutilized
9359
- * cpu available in an asymmetric capacity system. Do one last
9360
- * check just in case.
9361
- */
9362
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9363
- busiest->group_type == group_overloaded &&
9364
- busiest->sum_nr_running > busiest->group_weight &&
9365
- local->sum_nr_running < local->group_weight &&
9366
- local->group_capacity < busiest->group_capacity)
9367
- env->imbalance = busiest->load_per_task;
9278
+ return imbalance;
93689279 }
93699280
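
adjust_numa_imbalance() deliberately tolerates a small imbalance across NUMA nodes: when the source node runs at most two tasks (typically a communicating pair) the computed imbalance is dropped to zero so the pair stays local. The rule in isolation, as a sketch:

/* Keep a communicating pair on its node; otherwise honour the imbalance. */
static long adjust_numa_imbalance_sketch(long imbalance, int src_nr_running)
{
	const int imbalance_min = 2;

	return src_nr_running <= imbalance_min ? 0 : imbalance;
}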
93709281 /**
....@@ -9375,96 +9286,169 @@
93759286 */
93769287 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93779288 {
9378
- unsigned long max_pull, load_above_capacity = ~0UL;
93799289 struct sg_lb_stats *local, *busiest;
93809290
93819291 local = &sds->local_stat;
93829292 busiest = &sds->busiest_stat;
93839293
9294
+ if (busiest->group_type == group_misfit_task) {
9295
+ /* Set imbalance to allow misfit tasks to be balanced. */
9296
+ env->migration_type = migrate_misfit;
9297
+ env->imbalance = 1;
9298
+ return;
9299
+ }
9300
+
9301
+ if (busiest->group_type == group_asym_packing) {
9302
+ /*
9303
+ * In case of asym capacity, we will try to migrate all load to
9304
+ * the preferred CPU.
9305
+ */
9306
+ env->migration_type = migrate_task;
9307
+ env->imbalance = busiest->sum_h_nr_running;
9308
+ return;
9309
+ }
9310
+
93849311 if (busiest->group_type == group_imbalanced) {
93859312 /*
93869313 * In the group_imb case we cannot rely on group-wide averages
9387
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9314
+ * to ensure CPU-load equilibrium, try to move any task to fix
9315
+ * the imbalance. The next load balance will take care of
9316
+ * balancing back the system.
93889317 */
9389
- busiest->load_per_task =
9390
- min(busiest->load_per_task, sds->avg_load);
9318
+ env->migration_type = migrate_task;
9319
+ env->imbalance = 1;
9320
+ return;
93919321 }
93929322
93939323 /*
9394
- * Avg load of busiest sg can be less and avg load of local sg can
9395
- * be greater than avg load across all sgs of sd because avg load
9396
- * factors in sg capacity and sgs with smaller group_type are
9397
- * skipped when updating the busiest sg:
9324
+ * Try to use spare capacity of local group without overloading it or
9325
+ * emptying busiest.
93989326 */
9399
- if (busiest->group_type != group_misfit_task &&
9400
- (busiest->avg_load <= sds->avg_load ||
9401
- local->avg_load >= sds->avg_load)) {
9402
- env->imbalance = 0;
9403
- return fix_small_imbalance(env, sds);
9327
+ if (local->group_type == group_has_spare) {
9328
+ if ((busiest->group_type > group_fully_busy) &&
9329
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9330
+ /*
9331
+ * If busiest is overloaded, try to fill spare
9332
+ * capacity. This might end up creating spare capacity
9333
+ * in busiest or busiest still being overloaded but
9334
+ * there is no simple way to directly compute the
9335
+ * amount of load to migrate in order to balance the
9336
+ * system.
9337
+ */
9338
+ env->migration_type = migrate_util;
9339
+ env->imbalance = max(local->group_capacity, local->group_util) -
9340
+ local->group_util;
9341
+
9342
+ /*
9343
+ * In some cases, the group's utilization is max or even
9344
+ * higher than capacity because of migrations but the
9345
+ * local CPU is (newly) idle. There is at least one
9346
+ * waiting task in this overloaded busiest group. Let's
9347
+ * try to pull it.
9348
+ */
9349
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9350
+ env->migration_type = migrate_task;
9351
+ env->imbalance = 1;
9352
+ }
9353
+
9354
+ return;
9355
+ }
9356
+
9357
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9358
+ unsigned int nr_diff = busiest->sum_nr_running;
9359
+ /*
9360
+ * When prefer sibling, evenly spread running tasks on
9361
+ * groups.
9362
+ */
9363
+ env->migration_type = migrate_task;
9364
+ lsub_positive(&nr_diff, local->sum_nr_running);
9365
+ env->imbalance = nr_diff >> 1;
9366
+ } else {
9367
+
9368
+ /*
9369
+ * If there is no overload, we just want to even the number of
9370
+ * idle cpus.
9371
+ */
9372
+ env->migration_type = migrate_task;
9373
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9374
+ busiest->idle_cpus) >> 1);
9375
+ }
9376
+
9377
+ /* Consider allowing a small imbalance between NUMA groups */
9378
+ if (env->sd->flags & SD_NUMA)
9379
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9380
+ busiest->sum_nr_running);
9381
+
9382
+ return;
94049383 }
94059384
94069385 /*
9407
- * If there aren't any idle CPUs, avoid creating some.
9386
+ * Local is fully busy but has to take more load to relieve the
9387
+ * busiest group
94089388 */
9409
- if (busiest->group_type == group_overloaded &&
9410
- local->group_type == group_overloaded) {
9411
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9412
- if (load_above_capacity > busiest->group_capacity) {
9413
- load_above_capacity -= busiest->group_capacity;
9414
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9415
- load_above_capacity /= busiest->group_capacity;
9416
- } else
9417
- load_above_capacity = ~0UL;
9389
+ if (local->group_type < group_overloaded) {
9390
+ /*
9391
+ * Local will become overloaded so the avg_load metrics are
9392
+ * finally needed.
9393
+ */
9394
+
9395
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9396
+ local->group_capacity;
9397
+
9398
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9399
+ sds->total_capacity;
9400
+ /*
9401
+ * If the local group is more loaded than the selected
9402
+ * busiest group don't try to pull any tasks.
9403
+ */
9404
+ if (local->avg_load >= busiest->avg_load) {
9405
+ env->imbalance = 0;
9406
+ return;
9407
+ }
94189408 }
94199409
94209410 /*
9421
- * We're trying to get all the CPUs to the average_load, so we don't
9422
- * want to push ourselves above the average load, nor do we wish to
9423
- * reduce the max loaded CPU below the average load. At the same time,
9424
- * we also don't want to reduce the group load below the group
9425
- * capacity. Thus we look for the minimum possible imbalance.
9411
+ * Both group are or will become overloaded and we're trying to get all
9412
+ * the CPUs to the average_load, so we don't want to push ourselves
9413
+ * above the average load, nor do we wish to reduce the max loaded CPU
9414
+ * below the average load. At the same time, we also don't want to
9415
+ * reduce the group load below the group capacity. Thus we look for
9416
+ * the minimum possible imbalance.
94269417 */
9427
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9428
-
9429
- /* How much load to actually move to equalise the imbalance */
9418
+ env->migration_type = migrate_load;
94309419 env->imbalance = min(
9431
- max_pull * busiest->group_capacity,
9420
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94329421 (sds->avg_load - local->avg_load) * local->group_capacity
94339422 ) / SCHED_CAPACITY_SCALE;
9434
-
9435
- /* Boost imbalance to allow misfit task to be balanced.
9436
- * Always do this if we are doing a NEWLY_IDLE balance
9437
- * on the assumption that any tasks we have must not be
9438
- * long-running (and hence we cannot rely upon load).
9439
- * However if we are not idle, we should assume the tasks
9440
- * we have are longer running and not override load-based
9441
- * calculations above unless we are sure that the local
9442
- * group is underutilized.
9443
- */
9444
- if (busiest->group_type == group_misfit_task &&
9445
- (env->idle == CPU_NEWLY_IDLE ||
9446
- local->sum_nr_running < local->group_weight)) {
9447
- env->imbalance = max_t(long, env->imbalance,
9448
- busiest->group_misfit_task_load);
9449
- }
9450
-
9451
- /*
9452
- * if *imbalance is less than the average load per runnable task
9453
- * there is no guarantee that any tasks will be moved so we'll have
9454
- * a think about bumping its value to force at least one task to be
9455
- * moved
9456
- */
9457
- if (env->imbalance < busiest->load_per_task)
9458
- return fix_small_imbalance(env, sds);
94599423 }
94609424
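
For the overloaded/overloaded case the imbalance is the smaller of "how far busiest sits above the domain average" and "how far local sits below it", each weighted by the group's capacity, so neither side is pushed past the average. Worked through with made-up numbers (busiest 900, local 500, domain average 700, both capacities 1024) this yields 200:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

/* Sketch of the migrate_load leg of calculate_imbalance(). */
static unsigned long load_imbalance(unsigned long busiest_avg, unsigned long local_avg,
				    unsigned long sd_avg,
				    unsigned long busiest_cap, unsigned long local_cap)
{
	unsigned long a = (busiest_avg - sd_avg) * busiest_cap;
	unsigned long b = (sd_avg - local_avg) * local_cap;

	return (a < b ? a : b) / SCHED_CAPACITY_SCALE;
}

int main(void)
{
	/* busiest=900, local=500, domain average=700, both groups 1024 capacity */
	printf("%lu\n", load_imbalance(900, 500, 700, 1024, 1024)); /* prints 200 */
	return 0;
}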
94619425 /******* find_busiest_group() helpers end here *********************/
9426
+
9427
+/*
9428
+ * Decision matrix according to the local and busiest group type:
9429
+ *
9430
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9431
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9432
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9433
+ * misfit_task force N/A N/A N/A force force
9434
+ * asym_packing force force N/A N/A force force
9435
+ * imbalanced force force N/A N/A force force
9436
+ * overloaded force force N/A N/A force avg_load
9437
+ *
9438
+ * N/A : Not Applicable because already filtered while updating
9439
+ * statistics.
9440
+ * balanced : The system is balanced for these 2 groups.
9441
+ * force : Calculate the imbalance as load migration is probably needed.
9442
+ * avg_load : Only if imbalance is significant enough.
9443
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9444
+ * different in groups.
9445
+ */
94629446
94639447 /**
94649448 * find_busiest_group - Returns the busiest group within the sched_domain
94659449 * if there is an imbalance.
94669450 *
9467
- * Also calculates the amount of weighted load which should be moved
9451
+ * Also calculates the amount of runnable load which should be moved
94689452 * to restore balance.
94699453 *
94709454 * @env: The load balancing environment.
....@@ -9479,32 +9463,36 @@
94799463 init_sd_lb_stats(&sds);
94809464
94819465 /*
9482
- * Compute the various statistics relavent for load balancing at
9466
+ * Compute the various statistics relevant for load balancing at
94839467 * this level.
94849468 */
94859469 update_sd_lb_stats(env, &sds);
94869470
9487
- if (static_branch_unlikely(&sched_energy_present)) {
9471
+ if (sched_energy_enabled()) {
94889472 struct root_domain *rd = env->dst_rq->rd;
9473
+ int out_balance = 1;
94899474
9490
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9475
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9476
+ &out_balance);
9477
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9478
+ && out_balance)
94919479 goto out_balanced;
94929480 }
94939481
94949482 local = &sds.local_stat;
94959483 busiest = &sds.busiest_stat;
94969484
9497
- /* ASYM feature bypasses nice load balance check */
9498
- if (check_asym_packing(env, &sds))
9499
- return sds.busiest;
9500
-
95019485 /* There is no busy sibling group to pull tasks from */
9502
- if (!sds.busiest || busiest->sum_nr_running == 0)
9486
+ if (!sds.busiest)
95039487 goto out_balanced;
95049488
9505
- /* XXX broken for overlapping NUMA groups */
9506
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9507
- / sds.total_capacity;
9489
+ /* Misfit tasks should be dealt with regardless of the avg load */
9490
+ if (busiest->group_type == group_misfit_task)
9491
+ goto force_balance;
9492
+
9493
+ /* ASYM feature bypasses nice load balance check */
9494
+ if (busiest->group_type == group_asym_packing)
9495
+ goto force_balance;
95089496
95099497 /*
95109498 * If the busiest group is imbalanced the below checks don't
....@@ -9515,55 +9503,80 @@
95159503 goto force_balance;
95169504
95179505 /*
9518
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9519
- * capacities from resulting in underutilization due to avg_load.
9520
- */
9521
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9522
- busiest->group_no_capacity)
9523
- goto force_balance;
9524
-
9525
- /* Misfit tasks should be dealt with regardless of the avg load */
9526
- if (busiest->group_type == group_misfit_task)
9527
- goto force_balance;
9528
-
9529
- /*
95309506 * If the local group is busier than the selected busiest group
95319507 * don't try and pull any tasks.
95329508 */
9533
- if (local->avg_load >= busiest->avg_load)
9509
+ if (local->group_type > busiest->group_type)
95349510 goto out_balanced;
95359511
95369512 /*
9537
- * Don't pull any tasks if this group is already above the domain
9538
- * average load.
9513
+ * When groups are overloaded, use the avg_load to ensure fairness
9514
+ * between tasks.
95399515 */
9540
- if (local->avg_load >= sds.avg_load)
9541
- goto out_balanced;
9542
-
9543
- if (env->idle == CPU_IDLE) {
9516
+ if (local->group_type == group_overloaded) {
95449517 /*
9545
- * This CPU is idle. If the busiest group is not overloaded
9546
- * and there is no imbalance between this and busiest group
9547
- * wrt idle CPUs, it is balanced. The imbalance becomes
9548
- * significant if the diff is greater than 1 otherwise we
9549
- * might end up to just move the imbalance on another group
9518
+ * If the local group is more loaded than the selected
9519
+ * busiest group don't try to pull any tasks.
95509520 */
9551
- if ((busiest->group_type != group_overloaded) &&
9552
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9521
+ if (local->avg_load >= busiest->avg_load)
95539522 goto out_balanced;
9554
- } else {
9523
+
9524
+ /* XXX broken for overlapping NUMA groups */
9525
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9526
+ sds.total_capacity;
9527
+
95559528 /*
9556
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9557
- * imbalance_pct to be conservative.
9529
+ * Don't pull any tasks if this group is already above the
9530
+ * domain average load.
9531
+ */
9532
+ if (local->avg_load >= sds.avg_load)
9533
+ goto out_balanced;
9534
+
9535
+ /*
9536
+ * If the busiest group is more loaded, use imbalance_pct to be
9537
+ * conservative.
95589538 */
95599539 if (100 * busiest->avg_load <=
95609540 env->sd->imbalance_pct * local->avg_load)
95619541 goto out_balanced;
95629542 }
95639543
9544
+ /* Try to move all excess tasks to child's sibling domain */
9545
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9546
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9547
+ goto force_balance;
9548
+
9549
+ if (busiest->group_type != group_overloaded) {
9550
+ if (env->idle == CPU_NOT_IDLE)
9551
+ /*
9552
+ * If the busiest group is not overloaded (and as a
9553
+ * result the local one too) but this CPU is already
9554
+ * busy, let another idle CPU try to pull a task.
9555
+ */
9556
+ goto out_balanced;
9557
+
9558
+ if (busiest->group_weight > 1 &&
9559
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9560
+ /*
9561
+ * If the busiest group is not overloaded
9562
+ * and there is no imbalance between this and busiest
9563
+ * group wrt idle CPUs, it is balanced. The imbalance
9564
+ * becomes significant if the diff is greater than 1
9565
+ * otherwise we might end up just moving the imbalance
9566
+ * to another group. Of course this applies only if
9567
+ * there is more than 1 CPU per group.
9568
+ */
9569
+ goto out_balanced;
9570
+
9571
+ if (busiest->sum_h_nr_running == 1)
9572
+ /*
9573
+ * busiest doesn't have any tasks waiting to run
9574
+ */
9575
+ goto out_balanced;
9576
+ }
9577
+
95649578 force_balance:
95659579 /* Looks like there is an imbalance. Compute it */
9566
- env->src_grp_type = busiest->group_type;
95679580 calculate_imbalance(env, &sds);
95689581 return env->imbalance ? sds.busiest : NULL;
95699582
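
The overloaded-vs-overloaded path above ends with the imbalance_pct guard: the busiest group must carry more load per unit of capacity than the local group by a configurable margin before an imbalance is even computed. A self-contained sketch of that arithmetic (the helper name and the 125 value are assumptions for illustration, not taken from this patch):

#include <stdbool.h>

/* Mirrors: 100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load */
static bool groups_look_balanced(unsigned long local_avg_load,
				 unsigned long busiest_avg_load,
				 unsigned int imbalance_pct)
{
	return 100UL * busiest_avg_load <= imbalance_pct * local_avg_load;
}

/*
 * With a typical imbalance_pct of 125:
 *   local = 1000, busiest = 1100 -> 110000 <= 125000: treated as balanced.
 *   local = 1000, busiest = 1300 -> 130000 >  125000: an imbalance is computed.
 * The busiest group therefore needs roughly 25% more load per unit of
 * capacity before any tasks are pulled.
 */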
....@@ -9579,11 +9592,18 @@
95799592 struct sched_group *group)
95809593 {
95819594 struct rq *busiest = NULL, *rq;
9582
- unsigned long busiest_load = 0, busiest_capacity = 1;
9583
- int i;
9595
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9596
+ unsigned int busiest_nr = 0;
9597
+ int i, done = 0;
9598
+
9599
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9600
+ &busiest, &done);
9601
+ if (done)
9602
+ return busiest;
95849603
95859604 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9586
- unsigned long capacity, wl;
9605
+ unsigned long capacity, load, util;
9606
+ unsigned int nr_running;
95879607 enum fbq_type rt;
95889608
95899609 rq = cpu_rq(i);
....@@ -9611,20 +9631,8 @@
96119631 if (rt > env->fbq_type)
96129632 continue;
96139633
9614
- /*
9615
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9616
- * seek the "biggest" misfit task.
9617
- */
9618
- if (env->src_grp_type == group_misfit_task) {
9619
- if (rq->misfit_task_load > busiest_load) {
9620
- busiest_load = rq->misfit_task_load;
9621
- busiest = rq;
9622
- }
9623
-
9624
- continue;
9625
- }
9626
-
96279634 capacity = capacity_of(i);
9635
+ nr_running = rq->cfs.h_nr_running;
96289636
96299637 /*
96309638 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9634,35 +9642,77 @@
96349642 */
96359643 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96369644 capacity_of(env->dst_cpu) < capacity &&
9637
- rq->nr_running == 1)
9645
+ nr_running == 1)
96389646 continue;
96399647
9640
- wl = weighted_cpuload(rq);
9648
+ switch (env->migration_type) {
9649
+ case migrate_load:
9650
+ /*
9651
+ * When comparing with load imbalance, use cpu_load()
9652
+ * which is not scaled with the CPU capacity.
9653
+ */
9654
+ load = cpu_load(rq);
96419655
9642
- /*
9643
- * When comparing with imbalance, use weighted_cpuload()
9644
- * which is not scaled with the CPU capacity.
9645
- */
9656
+ if (nr_running == 1 && load > env->imbalance &&
9657
+ !check_cpu_capacity(rq, env->sd))
9658
+ break;
96469659
9647
- if (rq->nr_running == 1 && wl > env->imbalance &&
9648
- !check_cpu_capacity(rq, env->sd))
9649
- continue;
9660
+ /*
9661
+ * For the load comparisons with the other CPUs,
9662
+ * consider the cpu_load() scaled with the CPU
9663
+ * capacity, so that the load can be moved away
9664
+ * from the CPU that is potentially running at a
9665
+ * lower capacity.
9666
+ *
9667
+ * Thus we're looking for max(load_i / capacity_i),
9668
+ * crosswise multiplication to rid ourselves of the
9669
+ * division works out to:
9670
+ * load_i * capacity_j > load_j * capacity_i;
9671
+ * where j is our previous maximum.
9672
+ */
9673
+ if (load * busiest_capacity > busiest_load * capacity) {
9674
+ busiest_load = load;
9675
+ busiest_capacity = capacity;
9676
+ busiest = rq;
9677
+ }
9678
+ break;
96509679
9651
- /*
9652
- * For the load comparisons with the other CPU's, consider
9653
- * the weighted_cpuload() scaled with the CPU capacity, so
9654
- * that the load can be moved away from the CPU that is
9655
- * potentially running at a lower capacity.
9656
- *
9657
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9658
- * multiplication to rid ourselves of the division works out
9659
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9660
- * our previous maximum.
9661
- */
9662
- if (wl * busiest_capacity > busiest_load * capacity) {
9663
- busiest_load = wl;
9664
- busiest_capacity = capacity;
9665
- busiest = rq;
9680
+ case migrate_util:
9681
+ util = cpu_util(cpu_of(rq));
9682
+
9683
+ /*
9684
+ * Don't try to pull utilization from a CPU with one
9685
+ * running task. Whatever its utilization, we will fail to
9686
+ * detach the task.
9687
+ */
9688
+ if (nr_running <= 1)
9689
+ continue;
9690
+
9691
+ if (busiest_util < util) {
9692
+ busiest_util = util;
9693
+ busiest = rq;
9694
+ }
9695
+ break;
9696
+
9697
+ case migrate_task:
9698
+ if (busiest_nr < nr_running) {
9699
+ busiest_nr = nr_running;
9700
+ busiest = rq;
9701
+ }
9702
+ break;
9703
+
9704
+ case migrate_misfit:
9705
+ /*
9706
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9707
+ * simply seek the "biggest" misfit task.
9708
+ */
9709
+ if (rq->misfit_task_load > busiest_load) {
9710
+ busiest_load = rq->misfit_task_load;
9711
+ busiest = rq;
9712
+ }
9713
+
9714
+ break;
9715
+
96669716 }
96679717 }
96689718
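
The migrate_load case above picks max(load_i / capacity_i) without ever dividing, by cross-multiplying each candidate against the current maximum. A standalone sketch of that comparison (struct and function names are made up for illustration):

struct cpu_sample {
	unsigned long load;		/* cpu_load()-style value */
	unsigned long capacity;		/* capacity_of()-style value */
};

/* Returns the index with the highest load/capacity ratio, or -1 if all loads are zero. */
static int most_loaded_for_its_capacity(const struct cpu_sample *s, int nr)
{
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int i, busiest = -1;

	for (i = 0; i < nr; i++) {
		/* load_i * capacity_j > load_j * capacity_i, where j is the current maximum */
		if (s[i].load * busiest_capacity > busiest_load * s[i].capacity) {
			busiest_load = s[i].load;
			busiest_capacity = s[i].capacity;
			busiest = i;
		}
	}
	return busiest;
}

Keeping the comparison in integer math trades a theoretical overflow for precision; with load and capacity both far below 2^32, the product fits comfortably in an unsigned long on 64-bit targets.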
....@@ -9675,21 +9725,25 @@
96759725 */
96769726 #define MAX_PINNED_INTERVAL 512
96779727
9678
-static int need_active_balance(struct lb_env *env)
9728
+static inline bool
9729
+asym_active_balance(struct lb_env *env)
9730
+{
9731
+ /*
9732
+ * ASYM_PACKING needs to force migrate tasks from busy but
9733
+ * lower priority CPUs in order to pack all tasks in the
9734
+ * highest priority CPUs.
9735
+ */
9736
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9737
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9738
+}
9739
+
9740
+static inline bool
9741
+voluntary_active_balance(struct lb_env *env)
96799742 {
96809743 struct sched_domain *sd = env->sd;
96819744
9682
- if (env->idle == CPU_NEWLY_IDLE) {
9683
-
9684
- /*
9685
- * ASYM_PACKING needs to force migrate tasks from busy but
9686
- * lower priority CPUs in order to pack all tasks in the
9687
- * highest priority CPUs.
9688
- */
9689
- if ((sd->flags & SD_ASYM_PACKING) &&
9690
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9691
- return 1;
9692
- }
9745
+ if (asym_active_balance(env))
9746
+ return 1;
96939747
96949748 /*
96959749 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9704,19 +9758,18 @@
97049758 return 1;
97059759 }
97069760
9707
- if (env->src_grp_type == group_misfit_task)
9761
+ if (env->migration_type == migrate_misfit)
97089762 return 1;
97099763
9710
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9711
- env->src_rq->cfs.h_nr_running == 1 &&
9712
- cpu_overutilized(env->src_cpu) &&
9713
- !cpu_overutilized(env->dst_cpu)) {
9714
- return 1;
9715
- }
9764
+ return 0;
9765
+}
97169766
9717
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9718
- return 1;
9767
+static int need_active_balance(struct lb_env *env)
9768
+{
9769
+ struct sched_domain *sd = env->sd;
97199770
9771
+ if (voluntary_active_balance(env))
9772
+ return 1;
97209773
97219774 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97229775 }
....@@ -9726,7 +9779,17 @@
97269779 static int should_we_balance(struct lb_env *env)
97279780 {
97289781 struct sched_group *sg = env->sd->groups;
9729
- int cpu, balance_cpu = -1;
9782
+ int cpu;
9783
+
9784
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9785
+ struct root_domain *rd = env->dst_rq->rd;
9786
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
9787
+ int level = rockchip_perf_get_level();
9788
+
9789
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9790
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
9791
+ return 0;
9792
+ }
97309793
97319794 /*
97329795 * Ensure the balancing environment is consistent; can happen
....@@ -9747,18 +9810,12 @@
97479810 if (!idle_cpu(cpu))
97489811 continue;
97499812
9750
- balance_cpu = cpu;
9751
- break;
9813
+ /* Are we the first idle CPU? */
9814
+ return cpu == env->dst_cpu;
97529815 }
97539816
9754
- if (balance_cpu == -1)
9755
- balance_cpu = group_balance_cpu(sg);
9756
-
9757
- /*
9758
- * First idle CPU or the first CPU(busiest) in this sched group
9759
- * is eligible for doing load balancing at this and above domains.
9760
- */
9761
- return balance_cpu == env->dst_cpu;
9817
+ /* Are we the first CPU of this group ? */
9818
+ return group_balance_cpu(sg) == env->dst_cpu;
97629819 }
97639820
97649821 /*
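
Condensed model of the rewritten should_we_balance() election above (hypothetical helper signatures, not kernel APIs): exactly one CPU per group runs the balance pass, preferring the first idle CPU and falling back to the group's designated balance CPU, so the work is not repeated on every CPU of the group.

#include <stdbool.h>

static bool elected_to_balance(int this_cpu, const int *group_cpus, int nr,
			       bool (*cpu_is_idle)(int cpu), int group_balance_cpu)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (!cpu_is_idle(group_cpus[i]))
			continue;
		/* Are we the first idle CPU of the group? */
		return group_cpus[i] == this_cpu;
	}

	/* No idle CPU in the group: only the designated balance CPU proceeds. */
	return group_balance_cpu == this_cpu;
}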
....@@ -9830,6 +9887,7 @@
98309887
98319888 more_balance:
98329889 rq_lock_irqsave(busiest, &rf);
9890
+ env.src_rq_rf = &rf;
98339891 update_rq_clock(busiest);
98349892
98359893 /*
....@@ -9882,7 +9940,7 @@
98829940 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
98839941
98849942 /* Prevent to re-select dst_cpu via env's CPUs */
9885
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
9943
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
98869944
98879945 env.dst_rq = cpu_rq(env.new_dst_cpu);
98889946 env.dst_cpu = env.new_dst_cpu;
....@@ -9909,7 +9967,7 @@
99099967
99109968 /* All tasks on this runqueue were pinned by CPU affinity */
99119969 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9912
- cpumask_clear_cpu(cpu_of(busiest), cpus);
9970
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
99139971 /*
99149972 * Attempting to continue load balancing at the current
99159973 * sched_domain level only makes sense if there are
....@@ -9936,8 +9994,7 @@
99369994 * excessive cache_hot migrations and active balances.
99379995 */
99389996 if (idle != CPU_NEWLY_IDLE)
9939
- if (env.src_grp_nr_running > 1)
9940
- sd->nr_balance_failed++;
9997
+ sd->nr_balance_failed++;
99419998
99429999 if (need_active_balance(&env)) {
994310000 unsigned long flags;
....@@ -9980,7 +10037,7 @@
998010037 } else
998110038 sd->nr_balance_failed = 0;
998210039
9983
- if (likely(!active_balance)) {
10040
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998410041 /* We were unbalanced, so reset the balancing interval */
998510042 sd->balance_interval = sd->min_interval;
998610043 } else {
....@@ -10023,18 +10080,18 @@
1002310080 ld_moved = 0;
1002410081
1002510082 /*
10026
- * idle_balance() disregards balance intervals, so we could repeatedly
10027
- * reach this code, which would lead to balance_interval skyrocketting
10028
- * in a short amount of time. Skip the balance_interval increase logic
10029
- * to avoid that.
10083
+ * newidle_balance() disregards balance intervals, so we could
10084
+ * repeatedly reach this code, which would lead to balance_interval
10085
+ * skyrocketing in a short amount of time. Skip the balance_interval
10086
+ * increase logic to avoid that.
1003010087 */
1003110088 if (env.idle == CPU_NEWLY_IDLE)
1003210089 goto out;
1003310090
1003410091 /* tune up the balancing interval */
10035
- if (((env.flags & LBF_ALL_PINNED) &&
10036
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10037
- (sd->balance_interval < sd->max_interval))
10092
+ if ((env.flags & LBF_ALL_PINNED &&
10093
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10094
+ sd->balance_interval < sd->max_interval)
1003810095 sd->balance_interval *= 2;
1003910096 out:
1004010097 return ld_moved;
....@@ -10050,6 +10107,15 @@
1005010107
1005110108 /* scale ms to jiffies */
1005210109 interval = msecs_to_jiffies(interval);
10110
+
10111
+ /*
10112
+ * Reduce likelihood of busy balancing at higher domains racing with
10113
+ * balancing at lower domains by preventing their balancing periods
10114
+ * from being multiples of each other.
10115
+ */
10116
+ if (cpu_busy)
10117
+ interval -= 1;
10118
+
1005310119 interval = clamp(interval, 1UL, max_load_balance_interval);
1005410120
1005510121 return interval;
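
The "interval -= 1" added above keeps the busy balance period of a domain from being an exact multiple of its parent's, so the two levels rarely fire on the same jiffy. A toy calculation (the 256/512 jiffy values are made up, not defaults) shows the effect:

#include <stdio.h>

static unsigned long gcd(unsigned long a, unsigned long b)
{
	while (b) {
		unsigned long t = a % b;
		a = b;
		b = t;
	}
	return a;
}

static unsigned long lcm(unsigned long a, unsigned long b)
{
	return a / gcd(a, b) * b;
}

int main(void)
{
	unsigned long child = 256, parent = 512;	/* busy intervals in jiffies */

	printf("without -1: aligned every %lu jiffies\n", lcm(child, parent));
	printf("with    -1: aligned every %lu jiffies\n", lcm(child - 1, parent - 1));
	return 0;
}

/* Prints 512 and 130305: the child and parent levels now collide roughly 250x less often. */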
....@@ -10112,9 +10178,8 @@
1011210178 /* Search for an sd spanning us and the target CPU. */
1011310179 rcu_read_lock();
1011410180 for_each_domain(target_cpu, sd) {
10115
- if ((sd->flags & SD_LOAD_BALANCE) &&
10116
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10117
- break;
10181
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10182
+ break;
1011810183 }
1011910184
1012010185 if (likely(sd)) {
....@@ -10132,6 +10197,7 @@
1013210197 * about DST_PINNED.
1013310198 */
1013410199 .flags = LBF_DST_PINNED,
10200
+ .src_rq_rf = &rf,
1013510201 };
1013610202
1013710203 schedstat_inc(sd->alb_count);
....@@ -10167,7 +10233,7 @@
1016710233 */
1016810234 void update_max_interval(void)
1016910235 {
10170
- max_load_balance_interval = HZ*num_online_cpus()/10;
10236
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1017110237 }
1017210238
1017310239 /*
....@@ -10180,6 +10246,7 @@
1018010246 {
1018110247 int continue_balancing = 1;
1018210248 int cpu = rq->cpu;
10249
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018310250 unsigned long interval;
1018410251 struct sched_domain *sd;
1018510252 /* Earliest time when we have to do rebalance again */
....@@ -10187,6 +10254,10 @@
1018710254 int update_next_balance = 0;
1018810255 int need_serialize, need_decay = 0;
1018910256 u64 max_cost = 0;
10257
+
10258
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10259
+ if (!continue_balancing)
10260
+ return;
1019010261
1019110262 rcu_read_lock();
1019210263 for_each_domain(cpu, sd) {
....@@ -10202,9 +10273,6 @@
1020210273 }
1020310274 max_cost += sd->max_newidle_lb_cost;
1020410275
10205
- if (!(sd->flags & SD_LOAD_BALANCE))
10206
- continue;
10207
-
1020810276 /*
1020910277 * Stop the load balance at this level. There is another
1021010278 * CPU in our sched group which is doing load balancing more
....@@ -10216,7 +10284,7 @@
1021610284 break;
1021710285 }
1021810286
10219
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10287
+ interval = get_sd_balance_interval(sd, busy);
1022010288
1022110289 need_serialize = sd->flags & SD_SERIALIZE;
1022210290 if (need_serialize) {
....@@ -10232,9 +10300,10 @@
1023210300 * state even if we migrated tasks. Update it.
1023310301 */
1023410302 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10303
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023510304 }
1023610305 sd->last_balance = jiffies;
10237
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10306
+ interval = get_sd_balance_interval(sd, busy);
1023810307 }
1023910308 if (need_serialize)
1024010309 spin_unlock(&balancing);
....@@ -10294,7 +10363,11 @@
1029410363
1029510364 static inline int find_new_ilb(void)
1029610365 {
10297
- int ilb;
10366
+ int ilb = -1;
10367
+
10368
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10369
+ if (ilb >= 0)
10370
+ return ilb;
1029810371
1029910372 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1030010373 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10325,29 +10398,25 @@
1032510398 if (ilb_cpu >= nr_cpu_ids)
1032610399 return;
1032710400
10401
+ /*
10402
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10403
+ * the first flag owns it; cleared by nohz_csd_func().
10404
+ */
1032810405 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032910406 if (flags & NOHZ_KICK_MASK)
1033010407 return;
1033110408
1033210409 /*
10333
- * Use smp_send_reschedule() instead of resched_cpu().
10334
- * This way we generate a sched IPI on the target CPU which
10410
+ * This way we generate an IPI on the target CPU which
1033510411 * is idle. And the softirq performing nohz idle load balance
1033610412 * will be run before returning from the IPI.
1033710413 */
10338
- smp_send_reschedule(ilb_cpu);
10414
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033910415 }
1034010416
1034110417 /*
10342
- * Current heuristic for kicking the idle load balancer in the presence
10343
- * of an idle cpu in the system.
10344
- * - This rq has more than one task.
10345
- * - This rq has at least one CFS task and the capacity of the CPU is
10346
- * significantly reduced because of RT tasks or IRQs.
10347
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10348
- * multiple busy cpu.
10349
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10350
- * domain span are idle.
10418
+ * Current decision point for kicking the idle load balancer in the presence
10419
+ * of idle CPUs in the system.
1035110420 */
1035210421 static void nohz_balancer_kick(struct rq *rq)
1035310422 {
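
kick_ilb() above relies on atomic_fetch_or() returning the previous flag word: only the caller that transitions it from empty to non-empty owns the kick and sends the single IPI. A generic sketch of that ownership pattern (standalone state, not the kernel's nohz bookkeeping), shown here before the nohz_balancer_kick() body that follows:

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t kick_flags;

/* Returns true only for the caller that must actually raise the IPI. */
static bool claim_kick(unsigned int flags, unsigned int kick_mask)
{
	unsigned int old = atomic_fetch_or(flags, &kick_flags);

	/* Someone already owns a pending kick; they will fire the IPI. */
	return !(old & kick_mask);
}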
....@@ -10356,6 +10425,7 @@
1035610425 struct sched_domain *sd;
1035710426 int nr_busy, i, cpu = rq->cpu;
1035810427 unsigned int flags = 0;
10428
+ int done = 0;
1035910429
1036010430 if (unlikely(rq->idle_balance))
1036110431 return;
....@@ -10380,30 +10450,25 @@
1038010450 if (time_before(now, nohz.next_balance))
1038110451 goto out;
1038210452
10383
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10453
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10454
+ if (done)
10455
+ goto out;
10456
+
10457
+ if (rq->nr_running >= 2) {
1038410458 flags = NOHZ_KICK_MASK;
1038510459 goto out;
1038610460 }
1038710461
1038810462 rcu_read_lock();
10389
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10390
- if (sds) {
10391
- /*
10392
- * XXX: write a coherent comment on why we do this.
10393
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10394
- */
10395
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10396
- if (nr_busy > 1) {
10397
- flags = NOHZ_KICK_MASK;
10398
- goto unlock;
10399
- }
10400
-
10401
- }
1040210463
1040310464 sd = rcu_dereference(rq->sd);
1040410465 if (sd) {
10405
- if ((rq->cfs.h_nr_running >= 1) &&
10406
- check_cpu_capacity(rq, sd)) {
10466
+ /*
10467
+ * If there's a CFS task and the current CPU has reduced
10468
+ * capacity; kick the ILB to see if there's a better CPU to run
10469
+ * on.
10470
+ */
10471
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040710472 flags = NOHZ_KICK_MASK;
1040810473 goto unlock;
1040910474 }
....@@ -10411,15 +10476,55 @@
1041110476
1041210477 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041310478 if (sd) {
10414
- for_each_cpu(i, sched_domain_span(sd)) {
10415
- if (i == cpu ||
10416
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10417
- continue;
10418
-
10479
+ /*
10480
+ * When ASYM_PACKING; see if there's a more preferred CPU
10481
+ * currently idle; in which case, kick the ILB to move tasks
10482
+ * around.
10483
+ */
10484
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041910485 if (sched_asym_prefer(i, cpu)) {
1042010486 flags = NOHZ_KICK_MASK;
1042110487 goto unlock;
1042210488 }
10489
+ }
10490
+ }
10491
+
10492
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10493
+ if (sd) {
10494
+ /*
10495
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10496
+ * to run the misfit task on.
10497
+ */
10498
+ if (check_misfit_status(rq, sd)) {
10499
+ flags = NOHZ_KICK_MASK;
10500
+ goto unlock;
10501
+ }
10502
+
10503
+ /*
10504
+ * For asymmetric systems, we do not want to nicely balance
10505
+ * cache use, instead we want to embrace asymmetry and only
10506
+ * ensure tasks have enough CPU capacity.
10507
+ *
10508
+ * Skip the LLC logic because it's not relevant in that case.
10509
+ */
10510
+ goto unlock;
10511
+ }
10512
+
10513
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10514
+ if (sds) {
10515
+ /*
10516
+ * If there is an imbalance between LLC domains (IOW we could
10517
+ * increase the overall cache use), we need some less-loaded LLC
10518
+ * domain to pull some load. Likewise, we may need to spread
10519
+ * load within the current LLC domain (e.g. packed SMT cores but
10520
+ * other CPUs are idle). We can't really know from here how busy
10521
+ * the others are - so just get a nohz balance going if it looks
10522
+ * like this LLC domain has tasks we could move.
10523
+ */
10524
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10525
+ if (nr_busy > 1) {
10526
+ flags = NOHZ_KICK_MASK;
10527
+ goto unlock;
1042310528 }
1042410529 }
1042510530 unlock:
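
Condensed view of the kick conditions spelled out in the comments above; the struct and helper below are placeholders used only to summarise the decision order, not kernel interfaces:

#include <stdbool.h>

struct kick_inputs {
	unsigned int nr_running;	/* runnable tasks on this rq */
	bool has_cfs_task;		/* rq->cfs.h_nr_running >= 1 */
	bool capacity_reduced;		/* RT/IRQ pressure shrank this CPU */
	bool asym_packing, preferred_cpu_idle;
	bool asym_capacity, misfit_task;
	int llc_busy_cpus;		/* nr_busy_cpus of the LLC shared domain */
};

static bool want_nohz_kick(const struct kick_inputs *in)
{
	if (in->nr_running >= 2)
		return true;			/* more work than one CPU can run */
	if (in->has_cfs_task && in->capacity_reduced)
		return true;			/* a less pressured CPU may run it better */
	if (in->asym_packing && in->preferred_cpu_idle)
		return true;			/* pack onto the higher priority CPU */
	if (in->asym_capacity)
		return in->misfit_task;		/* skip the LLC logic entirely */
	return in->llc_busy_cpus > 1;		/* spread load between/within LLCs */
}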
....@@ -10485,9 +10590,20 @@
1048510590
1048610591 SCHED_WARN_ON(cpu != smp_processor_id());
1048710592
10488
- /* If this CPU is going down, then nothing needs to be done: */
10489
- if (!cpu_active(cpu))
10593
+ if (!cpu_active(cpu)) {
10594
+ /*
10595
+ * A CPU can be paused while it is idle with its tick
10596
+ * stopped. nohz_balance_exit_idle() should be called
10597
+ * from the local CPU, so it can't be called during
10598
+ * pause. This results in paused CPU participating in
10599
+ * the nohz idle balance, which should be avoided.
10600
+ *
10601
+ * When the paused CPU exits idle and enters again,
10602
+ * exempt the paused CPU from nohz_balance_exit_idle.
10603
+ */
10604
+ nohz_balance_exit_idle(rq);
1049010605 return;
10606
+ }
1049110607
1049210608 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049310609 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10600,7 +10716,6 @@
1060010716
1060110717 rq_lock_irqsave(rq, &rf);
1060210718 update_rq_clock(rq);
10603
- cpu_load_update_idle(rq);
1060410719 rq_unlock_irqrestore(rq, &rf);
1060510720
1060610721 if (flags & NOHZ_BALANCE_KICK)
....@@ -10650,22 +10765,14 @@
1065010765 */
1065110766 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065210767 {
10653
- int this_cpu = this_rq->cpu;
10654
- unsigned int flags;
10768
+ unsigned int flags = this_rq->nohz_idle_balance;
1065510769
10656
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10770
+ if (!flags)
1065710771 return false;
1065810772
10659
- if (idle != CPU_IDLE) {
10660
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10661
- return false;
10662
- }
10773
+ this_rq->nohz_idle_balance = 0;
1066310774
10664
- /*
10665
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10666
- */
10667
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10668
- if (!(flags & NOHZ_KICK_MASK))
10775
+ if (idle != CPU_IDLE)
1066910776 return false;
1067010777
1067110778 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10719,15 +10826,26 @@
1071910826 /*
1072010827 * idle_balance is called by schedule() if this_cpu is about to become
1072110828 * idle. Attempts to pull tasks from other CPUs.
10829
+ *
10830
+ * Returns:
10831
+ * < 0 - we released the lock and there are !fair tasks present
10832
+ * 0 - failed, no new tasks
10833
+ * > 0 - success, new (fair) tasks present
1072210834 */
10723
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
10835
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072410836 {
1072510837 unsigned long next_balance = jiffies + HZ;
1072610838 int this_cpu = this_rq->cpu;
1072710839 struct sched_domain *sd;
1072810840 int pulled_task = 0;
1072910841 u64 curr_cost = 0;
10842
+ int done = 0;
1073010843
10844
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
10845
+ if (done)
10846
+ return pulled_task;
10847
+
10848
+ update_misfit_status(NULL, this_rq);
1073110849 /*
1073210850 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073310851 * measure the duration of idle_balance() as idle time.
....@@ -10769,9 +10887,6 @@
1076910887 for_each_domain(this_cpu, sd) {
1077010888 int continue_balancing = 1;
1077110889 u64 t0, domain_cost;
10772
-
10773
- if (!(sd->flags & SD_LOAD_BALANCE))
10774
- continue;
1077510890
1077610891 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077710892 update_next_balance(sd, &next_balance);
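
For reference, a sketch of how a caller is expected to interpret the newidle_balance() return values documented above; the enum is illustrative, while the real pick_next_task path retries the class walk on a negative result and re-runs the CFS pick on a positive one:

enum newidle_outcome { RETRY_CLASS_WALK, PICK_CFS_AGAIN, GO_IDLE };

/* new_tasks is the value returned by newidle_balance(). */
static enum newidle_outcome interpret_newidle(int new_tasks)
{
	if (new_tasks < 0)
		return RETRY_CLASS_WALK;	/* rq->lock was dropped and !fair tasks appeared */
	if (new_tasks > 0)
		return PICK_CFS_AGAIN;		/* fair tasks were pulled: pick again */
	return GO_IDLE;				/* nothing pulled */
}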
....@@ -10962,6 +11077,9 @@
1096211077 if (!task_on_rq_queued(p))
1096311078 return;
1096411079
11080
+ if (rq->cfs.nr_running == 1)
11081
+ return;
11082
+
1096511083 /*
1096611084 * Reschedule if we are currently running on this runqueue and
1096711085 * our priority decreased, or if we are not currently running on
....@@ -11040,7 +11158,7 @@
1104011158 /* Catch up with the cfs_rq and remove our load when we leave */
1104111159 update_load_avg(cfs_rq, se, 0);
1104211160 detach_entity_load_avg(cfs_rq, se);
11043
- update_tg_load_avg(cfs_rq, false);
11161
+ update_tg_load_avg(cfs_rq);
1104411162 propagate_entity_cfs_rq(se);
1104511163 }
1104611164
....@@ -11058,8 +11176,8 @@
1105811176
1105911177 /* Synchronize entity with its cfs_rq */
1106011178 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11061
- attach_entity_load_avg(cfs_rq, se, 0);
11062
- update_tg_load_avg(cfs_rq, false);
11179
+ attach_entity_load_avg(cfs_rq, se);
11180
+ update_tg_load_avg(cfs_rq);
1106311181 propagate_entity_cfs_rq(se);
1106411182 }
1106511183
....@@ -11118,9 +11236,19 @@
1111811236 * This routine is mostly called to set cfs_rq->curr field when a task
1111911237 * migrates between groups/classes.
1112011238 */
11121
-static void set_curr_task_fair(struct rq *rq)
11239
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112211240 {
11123
- struct sched_entity *se = &rq->curr->se;
11241
+ struct sched_entity *se = &p->se;
11242
+
11243
+#ifdef CONFIG_SMP
11244
+ if (task_on_rq_queued(p)) {
11245
+ /*
11246
+ * Move the next running task to the front of the list, so our
11247
+ * cfs_tasks list becomes MRU one.
11248
+ */
11249
+ list_move(&se->group_node, &rq->cfs_tasks);
11250
+ }
11251
+#endif
1112411252
1112511253 for_each_sched_entity(se) {
1112611254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11381,8 +11509,8 @@
1138111509 /*
1138211510 * All the scheduling class methods:
1138311511 */
11384
-const struct sched_class fair_sched_class = {
11385
- .next = &idle_sched_class,
11512
+const struct sched_class fair_sched_class
11513
+ __section("__fair_sched_class") = {
1138611514 .enqueue_task = enqueue_task_fair,
1138711515 .dequeue_task = dequeue_task_fair,
1138811516 .yield_task = yield_task_fair,
....@@ -11390,10 +11518,12 @@
1139011518
1139111519 .check_preempt_curr = check_preempt_wakeup,
1139211520
11393
- .pick_next_task = pick_next_task_fair,
11521
+ .pick_next_task = __pick_next_task_fair,
1139411522 .put_prev_task = put_prev_task_fair,
11523
+ .set_next_task = set_next_task_fair,
1139511524
1139611525 #ifdef CONFIG_SMP
11526
+ .balance = balance_fair,
1139711527 .select_task_rq = select_task_rq_fair,
1139811528 .migrate_task_rq = migrate_task_rq_fair,
1139911529
....@@ -11404,7 +11534,6 @@
1140411534 .set_cpus_allowed = set_cpus_allowed_common,
1140511535 #endif
1140611536
11407
- .set_curr_task = set_curr_task_fair,
1140811537 .task_tick = task_tick_fair,
1140911538 .task_fork = task_fork_fair,
1141011539
....@@ -11474,3 +11603,101 @@
1147411603 #endif /* SMP */
1147511604
1147611605 }
11606
+
11607
+/*
11608
+ * Helper functions to facilitate extracting info from tracepoints.
11609
+ */
11610
+
11611
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11612
+{
11613
+#ifdef CONFIG_SMP
11614
+ return cfs_rq ? &cfs_rq->avg : NULL;
11615
+#else
11616
+ return NULL;
11617
+#endif
11618
+}
11619
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11620
+
11621
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11622
+{
11623
+ if (!cfs_rq) {
11624
+ if (str)
11625
+ strlcpy(str, "(null)", len);
11626
+ else
11627
+ return NULL;
11628
+ }
11629
+
11630
+ cfs_rq_tg_path(cfs_rq, str, len);
11631
+ return str;
11632
+}
11633
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11634
+
11635
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11636
+{
11637
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11638
+}
11639
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11640
+
11641
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11642
+{
11643
+#ifdef CONFIG_SMP
11644
+ return rq ? &rq->avg_rt : NULL;
11645
+#else
11646
+ return NULL;
11647
+#endif
11648
+}
11649
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11650
+
11651
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11652
+{
11653
+#ifdef CONFIG_SMP
11654
+ return rq ? &rq->avg_dl : NULL;
11655
+#else
11656
+ return NULL;
11657
+#endif
11658
+}
11659
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11660
+
11661
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11662
+{
11663
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11664
+ return rq ? &rq->avg_irq : NULL;
11665
+#else
11666
+ return NULL;
11667
+#endif
11668
+}
11669
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11670
+
11671
+int sched_trace_rq_cpu(struct rq *rq)
11672
+{
11673
+ return rq ? cpu_of(rq) : -1;
11674
+}
11675
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11676
+
11677
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11678
+{
11679
+ return rq ?
11680
+#ifdef CONFIG_SMP
11681
+ rq->cpu_capacity
11682
+#else
11683
+ SCHED_CAPACITY_SCALE
11684
+#endif
11685
+ : -1;
11686
+}
11687
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11688
+
11689
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11690
+{
11691
+#ifdef CONFIG_SMP
11692
+ return rd ? rd->span : NULL;
11693
+#else
11694
+ return NULL;
11695
+#endif
11696
+}
11697
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11698
+
11699
+int sched_trace_rq_nr_running(struct rq *rq)
11700
+{
11701
+ return rq ? rq->nr_running : -1;
11702
+}
11703
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
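
These helpers exist so that modules (for example Android vendor hooks or tracing modules) can read PELT state from the bare scheduler tracepoints without reaching into private scheduler structures. A minimal sketch of such a consumer, assuming the upstream pelt_cfs_tp tracepoint and its (void *data, struct cfs_rq *cfs_rq) probe signature; verify both against the kernel actually being built:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;

	sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path));
	pr_debug("cpu%d %s util_avg=%lu\n",
		 sched_trace_cfs_rq_cpu(cfs_rq), path, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");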