2023-12-09 95099d4622f8cb224d94e314c7a8e0df60b13f87
kernel/kernel/sched/fair.c
....@@ -20,12 +20,11 @@
2020 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
2121 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
2222 */
23
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
24
-#include <linux/cpufreq.h>
25
-#endif
2623 #include "sched.h"
2724
28
-#include <trace/events/sched.h>
25
+#include <trace/hooks/sched.h>
26
+
27
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_runtime);
2928
3029 /*
3130 * Targeted preemption latency for CPU-bound tasks:
....@@ -41,17 +40,8 @@
4140 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
4241 */
4342 unsigned int sysctl_sched_latency = 6000000ULL;
44
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
45
-
46
-/*
47
- * Enable/disable honoring sync flag in energy-aware wakeups.
48
- */
49
-unsigned int sysctl_sched_sync_hint_enable = 1;
50
-
51
-/*
52
- * Enable/disable using cstate knowledge in idle sibling selection
53
- */
54
-unsigned int sysctl_sched_cstate_aware = 1;
43
+EXPORT_SYMBOL_GPL(sysctl_sched_latency);
44
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
5545
5646 /*
5747 * The initial- and re-scaling of tunables is configurable
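
[Editor's note] The 6 ms value above is the boot-time normalized default; update_sysctl() further down in this file multiplies the normalized_sysctl_* values by a factor derived from the online CPU count. The userspace sketch below shows that arithmetic, assuming the default logarithmic scaling mode and the cap of 8 CPUs used for this calculation (both taken from my reading of get_update_sysctl_factor(), not from this hunk):

/*
 * Userspace sketch of how the normalized scheduler tunables are scaled
 * at boot.  The 8-CPU cap and the log2 rule are assumptions based on
 * get_update_sysctl_factor(); the names below are illustrative only.
 */
#include <stdio.h>

static unsigned int ilog2_u(unsigned int x)
{
	unsigned int r = 0;

	while (x >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long long normalized_latency = 6000000ULL;	/* 6 ms */
	unsigned int ncpus[] = { 1, 2, 4, 8, 64 };

	for (unsigned int i = 0; i < sizeof(ncpus) / sizeof(ncpus[0]); i++) {
		unsigned int cpus = ncpus[i] < 8 ? ncpus[i] : 8;
		unsigned int factor = 1 + ilog2_u(cpus);	/* SCHED_TUNABLESCALING_LOG */

		printf("%2u CPUs -> factor %u -> sched_latency %llu ns\n",
		       ncpus[i], factor, normalized_latency * factor);
	}
	return 0;
}

So a single-CPU system keeps the 6 ms default, while anything with 8 or more CPUs ends up at 24 ms.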
....@@ -71,8 +61,9 @@
7161 *
7262 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
7363 */
74
-unsigned int sysctl_sched_min_granularity = 750000ULL;
75
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
64
+unsigned int sysctl_sched_min_granularity = 750000ULL;
65
+EXPORT_SYMBOL_GPL(sysctl_sched_min_granularity);
66
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
7667
7768 /*
7869 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
....@@ -94,10 +85,23 @@
9485 *
9586 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
9687 */
97
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
98
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
88
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
89
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
9990
10091 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
92
+
93
+int sched_thermal_decay_shift;
94
+static int __init setup_sched_thermal_decay_shift(char *str)
95
+{
96
+ int _shift = 0;
97
+
98
+ if (kstrtoint(str, 0, &_shift))
99
+ pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
100
+
101
+ sched_thermal_decay_shift = clamp(_shift, 0, 10);
102
+ return 1;
103
+}
104
+__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
101105
102106 #ifdef CONFIG_SMP
103107 /*
....@@ -107,6 +111,14 @@
107111 {
108112 return -cpu;
109113 }
114
+
115
+/*
116
+ * The margin used when comparing utilization with CPU capacity.
117
+ *
118
+ * (default: ~20%)
119
+ */
120
+#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
121
+
110122 #endif
111123
112124 #ifdef CONFIG_CFS_BANDWIDTH
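
[Editor's note] For context on the fits_capacity() macro introduced in this hunk: utilization only "fits" a CPU when it stays below roughly 80% of the capacity (1024/1280), which is the ~20% margin the comment mentions. A standalone check of that arithmetic (userspace, not kernel code):

/* Standalone check of the ~20% margin encoded by fits_capacity() above. */
#include <assert.h>
#include <stdio.h>

#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

int main(void)
{
	/* 800/1024 (~78% utilization) still fits a capacity-1024 CPU ... */
	assert(fits_capacity(800UL, 1024UL));
	/* ... but 820/1024 (~80%) no longer does: 820*1280 = 1049600 > 1048576. */
	assert(!fits_capacity(820UL, 1024UL));
	/* On a little CPU of capacity 430 the cutoff is around 344 (~80%). */
	assert(fits_capacity(343UL, 430UL));
	assert(!fits_capacity(344UL, 430UL));

	printf("fits_capacity margin checks passed\n");
	return 0;
}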
....@@ -122,18 +134,6 @@
122134 */
123135 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
124136 #endif
125
-
126
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
127
-unsigned int sysctl_sched_performance_bias = 1;
128
-#endif
129
-
130
-/*
131
- * The margin used when comparing utilization with CPU capacity:
132
- * util * margin < capacity * 1024
133
- *
134
- * (default: ~20%)
135
- */
136
-unsigned int capacity_margin = 1280;
137137
138138 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
139139 {
....@@ -195,7 +195,7 @@
195195 #undef SET_SYSCTL
196196 }
197197
198
-void sched_init_granularity(void)
198
+void __init sched_init_granularity(void)
199199 {
200200 update_sysctl();
201201 }
....@@ -246,8 +246,7 @@
246246 }
247247 }
248248
249
- /* hint to use a 32x32->64 mul */
250
- fact = (u64)(u32)fact * lw->inv_weight;
249
+ fact = mul_u32_u32(fact, lw->inv_weight);
251250
252251 while (fact >> 32) {
253252 fact >>= 1;
....@@ -290,6 +289,19 @@
290289 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
291290 {
292291 return grp->my_q;
292
+}
293
+
294
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
295
+{
296
+ if (!path)
297
+ return;
298
+
299
+ if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
300
+ autogroup_path(cfs_rq->tg, path, len);
301
+ else if (cfs_rq && cfs_rq->tg->css.cgroup)
302
+ cgroup_path(cfs_rq->tg->css.cgroup, path, len);
303
+ else
304
+ strlcpy(path, "(null)", len);
293305 }
294306
295307 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -466,6 +478,12 @@
466478 return NULL;
467479 }
468480
481
+static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
482
+{
483
+ if (path)
484
+ strlcpy(path, "(null)", len);
485
+}
486
+
469487 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
470488 {
471489 return true;
....@@ -567,6 +585,7 @@
567585 struct sched_entity *entry;
568586 bool leftmost = true;
569587
588
+ trace_android_rvh_enqueue_entity(cfs_rq, se);
570589 /*
571590 * Find the right place in the rbtree:
572591 */
....@@ -592,6 +611,7 @@
592611
593612 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
594613 {
614
+ trace_android_rvh_dequeue_entity(cfs_rq, se);
595615 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
596616 }
597617
....@@ -631,8 +651,7 @@
631651 */
632652
633653 int sched_proc_update_handler(struct ctl_table *table, int write,
634
- void __user *buffer, size_t *lenp,
635
- loff_t *ppos)
654
+ void *buffer, size_t *lenp, loff_t *ppos)
636655 {
637656 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
638657 unsigned int factor = get_update_sysctl_factor();
....@@ -689,7 +708,13 @@
689708 */
690709 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
691710 {
692
- u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
711
+ unsigned int nr_running = cfs_rq->nr_running;
712
+ u64 slice;
713
+
714
+ if (sched_feat(ALT_PERIOD))
715
+ nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
716
+
717
+ slice = __sched_period(nr_running + !se->on_rq);
693718
694719 for_each_sched_entity(se) {
695720 struct load_weight *load;
....@@ -706,6 +731,10 @@
706731 }
707732 slice = __calc_delta(slice, se->load.weight, load);
708733 }
734
+
735
+ if (sched_feat(BASE_SLICE))
736
+ slice = max(slice, (u64)sysctl_sched_min_granularity);
737
+
709738 return slice;
710739 }
711740
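
[Editor's note] To make the slice arithmetic above concrete: for equally weighted tasks the slice is __sched_period(nr_running) / nr_running, the new ALT_PERIOD feature bases the count on the root cfs_rq's h_nr_running instead of the local cfs_rq, and BASE_SLICE floors the result at sysctl_sched_min_granularity. The userspace model below uses the default tunables from this file and assumes the usual sched_nr_latency threshold of 8 (latency / min_granularity), which this hunk does not spell out:

/*
 * Userspace model of period/slice for equally weighted tasks, using the
 * default tunables above.  sched_nr_latency = 8 is an assumption.
 */
#include <stdio.h>

static const unsigned long long latency = 6000000ULL;		/* 6 ms    */
static const unsigned long long min_gran = 750000ULL;		/* 0.75 ms */
static const unsigned int nr_latency = 8;

static unsigned long long period(unsigned int nr_running)
{
	if (nr_running > nr_latency)
		return nr_running * min_gran;
	return latency;
}

int main(void)
{
	unsigned int nr[] = { 1, 4, 8, 16 };

	for (unsigned int i = 0; i < 4; i++) {
		unsigned long long slice = period(nr[i]) / nr[i];

		/* BASE_SLICE additionally floors the slice at min_gran. */
		if (slice < min_gran)
			slice = min_gran;
		printf("%2u tasks: period %llu ns, slice %llu ns\n",
		       nr[i], period(nr[i]), slice);
	}
	return 0;
}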
....@@ -734,26 +763,17 @@
734763 memset(sa, 0, sizeof(*sa));
735764
736765 /*
737
- * Tasks are intialized with full load to be seen as heavy tasks until
766
+ * Tasks are initialized with full load to be seen as heavy tasks until
738767 * they get a chance to stabilize to their real load level.
739
- * Group entities are intialized with zero load to reflect the fact that
768
+ * Group entities are initialized with zero load to reflect the fact that
740769 * nothing has been attached to the task group yet.
741770 */
742771 if (entity_is_task(se))
743
- sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
772
+ sa->load_avg = scale_load_down(se->load.weight);
744773
745
- se->runnable_weight = se->load.weight;
746
-
747
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
748
- if (sysctl_sched_performance_bias) {
749
- sa->util_avg = SCHED_CAPACITY_SCALE >> 1;
750
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
751
- }
752
-#endif
753774 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
754775 }
755776
756
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
757777 static void attach_entity_cfs_rq(struct sched_entity *se);
758778
759779 /*
....@@ -782,18 +802,15 @@
782802 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
783803 * if util_avg > util_avg_cap.
784804 */
785
-void post_init_entity_util_avg(struct sched_entity *se)
805
+void post_init_entity_util_avg(struct task_struct *p)
786806 {
807
+ struct sched_entity *se = &p->se;
787808 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788809 struct sched_avg *sa = &se->avg;
789
- long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
810
+ long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
790811 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
791812
792
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
793
- if (!sysctl_sched_performance_bias && (cap > 0)) {
794
-#else
795813 if (cap > 0) {
796
-#endif
797814 if (cfs_rq->avg.util_avg != 0) {
798815 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
799816 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
....@@ -805,24 +822,25 @@
805822 }
806823 }
807824
808
- if (entity_is_task(se)) {
809
- struct task_struct *p = task_of(se);
810
- if (p->sched_class != &fair_sched_class) {
811
- /*
812
- * For !fair tasks do:
813
- *
814
- update_cfs_rq_load_avg(now, cfs_rq);
815
- attach_entity_load_avg(cfs_rq, se, 0);
816
- switched_from_fair(rq, p);
817
- *
818
- * such that the next switched_to_fair() has the
819
- * expected state.
820
- */
821
- se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
822
- return;
823
- }
825
+ sa->runnable_avg = sa->util_avg;
826
+
827
+ if (p->sched_class != &fair_sched_class) {
828
+ /*
829
+ * For !fair tasks do:
830
+ *
831
+ update_cfs_rq_load_avg(now, cfs_rq);
832
+ attach_entity_load_avg(cfs_rq, se);
833
+ switched_from_fair(rq, p);
834
+ *
835
+ * such that the next switched_to_fair() has the
836
+ * expected state.
837
+ */
838
+ se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
839
+ return;
824840 }
825841
842
+ /* Hook before this se's util is attached to cfs_rq's util */
843
+ trace_android_rvh_post_init_entity_util_avg(se);
826844 attach_entity_cfs_rq(se);
827845 }
828846
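
[Editor's note] A userspace model of the new-task seeding performed above: util_avg starts at half of the CPU's spare capacity, or at a share of the cfs_rq's current util_avg proportional to the task's weight, whichever is smaller. The clamp and the else branch follow my reading of the function body that this hunk partly elides, and weight/load_avg are treated in one common scale for simplicity, so take the details as an approximation:

/*
 * Userspace model of post_init_entity_util_avg() seeding.  Numbers are
 * illustrative; 1024 is the usual SCHED_CAPACITY_SCALE of a big CPU.
 */
#include <stdio.h>

static long post_init_util(long cpu_scale, long cfs_util_avg,
			   long cfs_load_avg, long se_weight)
{
	long cap = (cpu_scale - cfs_util_avg) / 2;
	long util;

	if (cap <= 0)
		return 0;	/* CPU already looks fully utilized */

	if (cfs_util_avg != 0) {
		util = cfs_util_avg * se_weight / (cfs_load_avg + 1);
		if (util > cap)
			util = cap;	/* assumed clamp, elided by the hunk */
	} else {
		util = cap;
	}
	return util;
}

int main(void)
{
	/* Idle CPU: a new task starts at half the remaining capacity (512). */
	printf("idle cpu: %ld\n", post_init_util(1024, 0, 0, 1024));
	/* Busy CPU (util 600, load 2048): the estimate is scaled, then capped at 212. */
	printf("busy cpu: %ld\n", post_init_util(1024, 600, 2048, 1024));
	return 0;
}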
....@@ -830,10 +848,10 @@
830848 void init_entity_runnable_average(struct sched_entity *se)
831849 {
832850 }
833
-void post_init_entity_util_avg(struct sched_entity *se)
851
+void post_init_entity_util_avg(struct task_struct *p)
834852 {
835853 }
836
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
854
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
837855 {
838856 }
839857 #endif /* CONFIG_SMP */
....@@ -983,7 +1001,6 @@
9831001 }
9841002
9851003 trace_sched_stat_blocked(tsk, delta);
986
- trace_sched_blocked_reason(tsk);
9871004
9881005 /*
9891006 * Blocking time is in units of nanosecs, so shift by
....@@ -1078,7 +1095,7 @@
10781095 unsigned int sysctl_numa_balancing_scan_delay = 1000;
10791096
10801097 struct numa_group {
1081
- atomic_t refcount;
1098
+ refcount_t refcount;
10821099
10831100 spinlock_t lock; /* nr_tasks, tasks */
10841101 int nr_tasks;
....@@ -1094,7 +1111,7 @@
10941111 * more by CPU use than by memory faults.
10951112 */
10961113 unsigned long *faults_cpu;
1097
- unsigned long faults[0];
1114
+ unsigned long faults[];
10981115 };
10991116
11001117 /*
....@@ -1164,7 +1181,7 @@
11641181 unsigned long shared = group_faults_shared(ng);
11651182 unsigned long private = group_faults_priv(ng);
11661183
1167
- period *= atomic_read(&ng->refcount);
1184
+ period *= refcount_read(&ng->refcount);
11681185 period *= shared + 1;
11691186 period /= private + shared + 1;
11701187 }
....@@ -1189,7 +1206,7 @@
11891206 unsigned long private = group_faults_priv(ng);
11901207 unsigned long period = smax;
11911208
1192
- period *= atomic_read(&ng->refcount);
1209
+ period *= refcount_read(&ng->refcount);
11931210 period *= shared + 1;
11941211 period /= private + shared + 1;
11951212
....@@ -1199,56 +1216,15 @@
11991216 return max(smin, smax);
12001217 }
12011218
1202
-void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1203
-{
1204
- int mm_users = 0;
1205
- struct mm_struct *mm = p->mm;
1206
-
1207
- if (mm) {
1208
- mm_users = atomic_read(&mm->mm_users);
1209
- if (mm_users == 1) {
1210
- mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1211
- mm->numa_scan_seq = 0;
1212
- }
1213
- }
1214
- p->node_stamp = 0;
1215
- p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1216
- p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1217
- p->numa_work.next = &p->numa_work;
1218
- p->numa_faults = NULL;
1219
- RCU_INIT_POINTER(p->numa_group, NULL);
1220
- p->last_task_numa_placement = 0;
1221
- p->last_sum_exec_runtime = 0;
1222
-
1223
- /* New address space, reset the preferred nid */
1224
- if (!(clone_flags & CLONE_VM)) {
1225
- p->numa_preferred_nid = -1;
1226
- return;
1227
- }
1228
-
1229
- /*
1230
- * New thread, keep existing numa_preferred_nid which should be copied
1231
- * already by arch_dup_task_struct but stagger when scans start.
1232
- */
1233
- if (mm) {
1234
- unsigned int delay;
1235
-
1236
- delay = min_t(unsigned int, task_scan_max(current),
1237
- current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1238
- delay += 2 * TICK_NSEC;
1239
- p->node_stamp = delay;
1240
- }
1241
-}
1242
-
12431219 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
12441220 {
1245
- rq->nr_numa_running += (p->numa_preferred_nid != -1);
1221
+ rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
12461222 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
12471223 }
12481224
12491225 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
12501226 {
1251
- rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1227
+ rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
12521228 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
12531229 }
12541230
....@@ -1474,7 +1450,7 @@
14741450 * two full passes of the "multi-stage node selection" test that is
14751451 * executed below.
14761452 */
1477
- if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
1453
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
14781454 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
14791455 return true;
14801456
....@@ -1527,55 +1503,52 @@
15271503 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
15281504 }
15291505
1530
-static unsigned long weighted_cpuload(struct rq *rq);
1531
-static unsigned long source_load(int cpu, int type);
1532
-static unsigned long target_load(int cpu, int type);
1506
+/*
1507
+ * 'numa_type' describes the node at the moment of load balancing.
1508
+ */
1509
+enum numa_type {
1510
+ /* The node has spare capacity that can be used to run more tasks. */
1511
+ node_has_spare = 0,
1512
+ /*
1513
+ * The node is fully used and the tasks don't compete for more CPU
1514
+ * cycles. Nevertheless, some tasks might wait before running.
1515
+ */
1516
+ node_fully_busy,
1517
+ /*
1518
+ * The node is overloaded and can't provide expected CPU cycles to all
1519
+ * tasks.
1520
+ */
1521
+ node_overloaded
1522
+};
15331523
15341524 /* Cached statistics for all CPUs within a node */
15351525 struct numa_stats {
15361526 unsigned long load;
1537
-
1527
+ unsigned long runnable;
1528
+ unsigned long util;
15381529 /* Total compute capacity of CPUs on a node */
15391530 unsigned long compute_capacity;
1540
-
15411531 unsigned int nr_running;
1532
+ unsigned int weight;
1533
+ enum numa_type node_type;
1534
+ int idle_cpu;
15421535 };
15431536
1544
-/*
1545
- * XXX borrowed from update_sg_lb_stats
1546
- */
1547
-static void update_numa_stats(struct numa_stats *ns, int nid)
1537
+static inline bool is_core_idle(int cpu)
15481538 {
1549
- int smt, cpu, cpus = 0;
1550
- unsigned long capacity;
1539
+#ifdef CONFIG_SCHED_SMT
1540
+ int sibling;
15511541
1552
- memset(ns, 0, sizeof(*ns));
1553
- for_each_cpu(cpu, cpumask_of_node(nid)) {
1554
- struct rq *rq = cpu_rq(cpu);
1542
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1543
+ if (cpu == sibling)
1544
+ continue;
15551545
1556
- ns->nr_running += rq->nr_running;
1557
- ns->load += weighted_cpuload(rq);
1558
- ns->compute_capacity += capacity_of(cpu);
1559
-
1560
- cpus++;
1546
+ if (!idle_cpu(sibling))
1547
+ return false;
15611548 }
1549
+#endif
15621550
1563
- /*
1564
- * If we raced with hotplug and there are no CPUs left in our mask
1565
- * the @ns structure is NULL'ed and task_numa_compare() will
1566
- * not find this node attractive.
1567
- *
1568
- * We'll detect a huge imbalance and bail there.
1569
- */
1570
- if (!cpus)
1571
- return;
1572
-
1573
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1574
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1575
- capacity = cpus / smt; /* cores */
1576
-
1577
- capacity = min_t(unsigned, capacity,
1578
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1551
+ return true;
15791552 }
15801553
15811554 struct task_numa_env {
....@@ -1594,20 +1567,132 @@
15941567 int best_cpu;
15951568 };
15961569
1570
+static unsigned long cpu_load(struct rq *rq);
1571
+static unsigned long cpu_runnable(struct rq *rq);
1572
+static unsigned long cpu_util(int cpu);
1573
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
1574
+
1575
+static inline enum
1576
+numa_type numa_classify(unsigned int imbalance_pct,
1577
+ struct numa_stats *ns)
1578
+{
1579
+ if ((ns->nr_running > ns->weight) &&
1580
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1581
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
1582
+ return node_overloaded;
1583
+
1584
+ if ((ns->nr_running < ns->weight) ||
1585
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1586
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
1587
+ return node_has_spare;
1588
+
1589
+ return node_fully_busy;
1590
+}
1591
+
1592
+#ifdef CONFIG_SCHED_SMT
1593
+/* Forward declarations of select_idle_sibling helpers */
1594
+static inline bool test_idle_cores(int cpu, bool def);
1595
+static inline int numa_idle_core(int idle_core, int cpu)
1596
+{
1597
+ if (!static_branch_likely(&sched_smt_present) ||
1598
+ idle_core >= 0 || !test_idle_cores(cpu, false))
1599
+ return idle_core;
1600
+
1601
+ /*
1602
+ * Prefer cores instead of packing HT siblings
1603
+ * and triggering future load balancing.
1604
+ */
1605
+ if (is_core_idle(cpu))
1606
+ idle_core = cpu;
1607
+
1608
+ return idle_core;
1609
+}
1610
+#else
1611
+static inline int numa_idle_core(int idle_core, int cpu)
1612
+{
1613
+ return idle_core;
1614
+}
1615
+#endif
1616
+
1617
+/*
1618
+ * Gather all necessary information to make NUMA balancing placement
1619
+ * decisions that are compatible with standard load balancer. This
1620
+ * borrows code and logic from update_sg_lb_stats but sharing a
1621
+ * common implementation is impractical.
1622
+ */
1623
+static void update_numa_stats(struct task_numa_env *env,
1624
+ struct numa_stats *ns, int nid,
1625
+ bool find_idle)
1626
+{
1627
+ int cpu, idle_core = -1;
1628
+
1629
+ memset(ns, 0, sizeof(*ns));
1630
+ ns->idle_cpu = -1;
1631
+
1632
+ rcu_read_lock();
1633
+ for_each_cpu(cpu, cpumask_of_node(nid)) {
1634
+ struct rq *rq = cpu_rq(cpu);
1635
+
1636
+ ns->load += cpu_load(rq);
1637
+ ns->runnable += cpu_runnable(rq);
1638
+ ns->util += cpu_util(cpu);
1639
+ ns->nr_running += rq->cfs.h_nr_running;
1640
+ ns->compute_capacity += capacity_of(cpu);
1641
+
1642
+ if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1643
+ if (READ_ONCE(rq->numa_migrate_on) ||
1644
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1645
+ continue;
1646
+
1647
+ if (ns->idle_cpu == -1)
1648
+ ns->idle_cpu = cpu;
1649
+
1650
+ idle_core = numa_idle_core(idle_core, cpu);
1651
+ }
1652
+ }
1653
+ rcu_read_unlock();
1654
+
1655
+ ns->weight = cpumask_weight(cpumask_of_node(nid));
1656
+
1657
+ ns->node_type = numa_classify(env->imbalance_pct, ns);
1658
+
1659
+ if (idle_core >= 0)
1660
+ ns->idle_cpu = idle_core;
1661
+}
1662
+
15971663 static void task_numa_assign(struct task_numa_env *env,
15981664 struct task_struct *p, long imp)
15991665 {
16001666 struct rq *rq = cpu_rq(env->dst_cpu);
16011667
1602
- /* Bail out if run-queue part of active NUMA balance. */
1603
- if (xchg(&rq->numa_migrate_on, 1))
1604
- return;
1668
+ /* Check if run-queue part of active NUMA balance. */
1669
+ if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1670
+ int cpu;
1671
+ int start = env->dst_cpu;
16051672
1673
+ /* Find alternative idle CPU. */
1674
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1675
+ if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1676
+ !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1677
+ continue;
1678
+ }
1679
+
1680
+ env->dst_cpu = cpu;
1681
+ rq = cpu_rq(env->dst_cpu);
1682
+ if (!xchg(&rq->numa_migrate_on, 1))
1683
+ goto assign;
1684
+ }
1685
+
1686
+ /* Failed to find an alternative idle CPU */
1687
+ return;
1688
+ }
1689
+
1690
+assign:
16061691 /*
16071692 * Clear previous best_cpu/rq numa-migrate flag, since task now
16081693 * found a better CPU to move/swap.
16091694 */
1610
- if (env->best_cpu != -1) {
1695
+ if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
16111696 rq = cpu_rq(env->best_cpu);
16121697 WRITE_ONCE(rq->numa_migrate_on, 0);
16131698 }
....@@ -1663,7 +1748,7 @@
16631748 * into account that it might be best if task running on the dst_cpu should
16641749 * be exchanged with the source task
16651750 */
1666
-static void task_numa_compare(struct task_numa_env *env,
1751
+static bool task_numa_compare(struct task_numa_env *env,
16671752 long taskimp, long groupimp, bool maymove)
16681753 {
16691754 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
....@@ -1674,12 +1759,13 @@
16741759 int dist = env->dist;
16751760 long moveimp = imp;
16761761 long load;
1762
+ bool stopsearch = false;
16771763
16781764 if (READ_ONCE(dst_rq->numa_migrate_on))
1679
- return;
1765
+ return false;
16801766
16811767 rcu_read_lock();
1682
- cur = task_rcu_dereference(&dst_rq->curr);
1768
+ cur = rcu_dereference(dst_rq->curr);
16831769 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
16841770 cur = NULL;
16851771
....@@ -1687,8 +1773,10 @@
16871773 * Because we have preemption enabled we can get migrated around and
16881774 * end try selecting ourselves (current == env->p) as a swap candidate.
16891775 */
1690
- if (cur == env->p)
1776
+ if (cur == env->p) {
1777
+ stopsearch = true;
16911778 goto unlock;
1779
+ }
16921780
16931781 if (!cur) {
16941782 if (maymove && moveimp >= env->best_imp)
....@@ -1697,18 +1785,27 @@
16971785 goto unlock;
16981786 }
16991787
1788
+ /* Skip this swap candidate if cannot move to the source cpu. */
1789
+ if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1790
+ goto unlock;
1791
+
1792
+ /*
1793
+ * Skip this swap candidate if it is not moving to its preferred
1794
+ * node and the best task is.
1795
+ */
1796
+ if (env->best_task &&
1797
+ env->best_task->numa_preferred_nid == env->src_nid &&
1798
+ cur->numa_preferred_nid != env->src_nid) {
1799
+ goto unlock;
1800
+ }
1801
+
17001802 /*
17011803 * "imp" is the fault differential for the source task between the
17021804 * source and destination node. Calculate the total differential for
17031805 * the source task and potential destination task. The more negative
17041806 * the value is, the more remote accesses that would be expected to
17051807 * be incurred if the tasks were swapped.
1706
- */
1707
- /* Skip this swap candidate if cannot move to the source cpu */
1708
- if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1709
- goto unlock;
1710
-
1711
- /*
1808
+ *
17121809 * If dst and source tasks are in the same NUMA group, or not
17131810 * in any group then look only at task weights.
17141811 */
....@@ -1735,9 +1832,31 @@
17351832 task_weight(cur, env->dst_nid, dist);
17361833 }
17371834
1835
+ /* Discourage picking a task already on its preferred node */
1836
+ if (cur->numa_preferred_nid == env->dst_nid)
1837
+ imp -= imp / 16;
1838
+
1839
+ /*
1840
+ * Encourage picking a task that moves to its preferred node.
1841
+ * This potentially makes imp larger than it's maximum of
1842
+ * 1998 (see SMALLIMP and task_weight for why) but in this
1843
+ * case, it does not matter.
1844
+ */
1845
+ if (cur->numa_preferred_nid == env->src_nid)
1846
+ imp += imp / 8;
1847
+
17381848 if (maymove && moveimp > imp && moveimp > env->best_imp) {
17391849 imp = moveimp;
17401850 cur = NULL;
1851
+ goto assign;
1852
+ }
1853
+
1854
+ /*
1855
+ * Prefer swapping with a task moving to its preferred node over a
1856
+ * task that is not.
1857
+ */
1858
+ if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1859
+ env->best_task->numa_preferred_nid != env->src_nid) {
17411860 goto assign;
17421861 }
17431862
....@@ -1764,50 +1883,104 @@
17641883 goto unlock;
17651884
17661885 assign:
1767
- /*
1768
- * One idle CPU per node is evaluated for a task numa move.
1769
- * Call select_idle_sibling to maybe find a better one.
1770
- */
1886
+ /* Evaluate an idle CPU for a task numa move. */
17711887 if (!cur) {
1888
+ int cpu = env->dst_stats.idle_cpu;
1889
+
1890
+ /* Nothing cached so current CPU went idle since the search. */
1891
+ if (cpu < 0)
1892
+ cpu = env->dst_cpu;
1893
+
17721894 /*
1773
- * select_idle_siblings() uses an per-CPU cpumask that
1774
- * can be used from IRQ context.
1895
+ * If the CPU is no longer truly idle and the previous best CPU
1896
+ * is, keep using it.
17751897 */
1776
- local_irq_disable();
1777
- env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1778
- env->dst_cpu);
1779
- local_irq_enable();
1898
+ if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1899
+ idle_cpu(env->best_cpu)) {
1900
+ cpu = env->best_cpu;
1901
+ }
1902
+
1903
+ env->dst_cpu = cpu;
17801904 }
17811905
17821906 task_numa_assign(env, cur, imp);
1907
+
1908
+ /*
1909
+ * If a move to idle is allowed because there is capacity or load
1910
+ * balance improves then stop the search. While a better swap
1911
+ * candidate may exist, a search is not free.
1912
+ */
1913
+ if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1914
+ stopsearch = true;
1915
+
1916
+ /*
1917
+ * If a swap candidate must be identified and the current best task
1918
+ * moves its preferred node then stop the search.
1919
+ */
1920
+ if (!maymove && env->best_task &&
1921
+ env->best_task->numa_preferred_nid == env->src_nid) {
1922
+ stopsearch = true;
1923
+ }
17831924 unlock:
17841925 rcu_read_unlock();
1926
+
1927
+ return stopsearch;
17851928 }
17861929
17871930 static void task_numa_find_cpu(struct task_numa_env *env,
17881931 long taskimp, long groupimp)
17891932 {
1790
- long src_load, dst_load, load;
17911933 bool maymove = false;
17921934 int cpu;
17931935
1794
- load = task_h_load(env->p);
1795
- dst_load = env->dst_stats.load + load;
1796
- src_load = env->src_stats.load - load;
1797
-
17981936 /*
1799
- * If the improvement from just moving env->p direction is better
1800
- * than swapping tasks around, check if a move is possible.
1937
+ * If dst node has spare capacity, then check if there is an
1938
+ * imbalance that would be overruled by the load balancer.
18011939 */
1802
- maymove = !load_too_imbalanced(src_load, dst_load, env);
1940
+ if (env->dst_stats.node_type == node_has_spare) {
1941
+ unsigned int imbalance;
1942
+ int src_running, dst_running;
1943
+
1944
+ /*
1945
+ * Would movement cause an imbalance? Note that if src has
1946
+ * more running tasks that the imbalance is ignored as the
1947
+ * move improves the imbalance from the perspective of the
1948
+ * CPU load balancer.
1949
+ * */
1950
+ src_running = env->src_stats.nr_running - 1;
1951
+ dst_running = env->dst_stats.nr_running + 1;
1952
+ imbalance = max(0, dst_running - src_running);
1953
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
1954
+
1955
+ /* Use idle CPU if there is no imbalance */
1956
+ if (!imbalance) {
1957
+ maymove = true;
1958
+ if (env->dst_stats.idle_cpu >= 0) {
1959
+ env->dst_cpu = env->dst_stats.idle_cpu;
1960
+ task_numa_assign(env, NULL, 0);
1961
+ return;
1962
+ }
1963
+ }
1964
+ } else {
1965
+ long src_load, dst_load, load;
1966
+ /*
1967
+ * If the improvement from just moving env->p direction is better
1968
+ * than swapping tasks around, check if a move is possible.
1969
+ */
1970
+ load = task_h_load(env->p);
1971
+ dst_load = env->dst_stats.load + load;
1972
+ src_load = env->src_stats.load - load;
1973
+ maymove = !load_too_imbalanced(src_load, dst_load, env);
1974
+ }
18031975
18041976 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
18051977 /* Skip this CPU if the source task cannot migrate */
1806
- if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1978
+ if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
18071979 continue;
18081980
18091981 env->dst_cpu = cpu;
1810
- task_numa_compare(env, taskimp, groupimp, maymove);
1982
+ if (task_numa_compare(env, taskimp, groupimp, maymove))
1983
+ break;
18111984 }
18121985 }
18131986
....@@ -1861,10 +2034,10 @@
18612034 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
18622035 taskweight = task_weight(p, env.src_nid, dist);
18632036 groupweight = group_weight(p, env.src_nid, dist);
1864
- update_numa_stats(&env.src_stats, env.src_nid);
2037
+ update_numa_stats(&env, &env.src_stats, env.src_nid, false);
18652038 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
18662039 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1867
- update_numa_stats(&env.dst_stats, env.dst_nid);
2040
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
18682041
18692042 /* Try to find a spot on the preferred nid. */
18702043 task_numa_find_cpu(&env, taskimp, groupimp);
....@@ -1897,7 +2070,7 @@
18972070
18982071 env.dist = dist;
18992072 env.dst_nid = nid;
1900
- update_numa_stats(&env.dst_stats, env.dst_nid);
2073
+ update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
19012074 task_numa_find_cpu(&env, taskimp, groupimp);
19022075 }
19032076 }
....@@ -1921,15 +2094,17 @@
19212094 }
19222095
19232096 /* No better CPU than the current one was found. */
1924
- if (env.best_cpu == -1)
2097
+ if (env.best_cpu == -1) {
2098
+ trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
19252099 return -EAGAIN;
2100
+ }
19262101
19272102 best_rq = cpu_rq(env.best_cpu);
19282103 if (env.best_task == NULL) {
19292104 ret = migrate_task_to(p, env.best_cpu);
19302105 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19312106 if (ret != 0)
1932
- trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
2107
+ trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
19332108 return ret;
19342109 }
19352110
....@@ -1937,7 +2112,7 @@
19372112 WRITE_ONCE(best_rq->numa_migrate_on, 0);
19382113
19392114 if (ret != 0)
1940
- trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
2115
+ trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
19412116 put_task_struct(env.best_task);
19422117 return ret;
19432118 }
....@@ -1948,7 +2123,7 @@
19482123 unsigned long interval = HZ;
19492124
19502125 /* This task has no NUMA fault statistics yet */
1951
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
2126
+ if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
19522127 return;
19532128
19542129 /* Periodically retry migrating the task to the preferred node */
....@@ -2199,7 +2374,7 @@
21992374
22002375 static void task_numa_placement(struct task_struct *p)
22012376 {
2202
- int seq, nid, max_nid = -1;
2377
+ int seq, nid, max_nid = NUMA_NO_NODE;
22032378 unsigned long max_faults = 0;
22042379 unsigned long fault_types[2] = { 0, 0 };
22052380 unsigned long total_faults;
....@@ -2309,12 +2484,12 @@
23092484
23102485 static inline int get_numa_group(struct numa_group *grp)
23112486 {
2312
- return atomic_inc_not_zero(&grp->refcount);
2487
+ return refcount_inc_not_zero(&grp->refcount);
23132488 }
23142489
23152490 static inline void put_numa_group(struct numa_group *grp)
23162491 {
2317
- if (atomic_dec_and_test(&grp->refcount))
2492
+ if (refcount_dec_and_test(&grp->refcount))
23182493 kfree_rcu(grp, rcu);
23192494 }
23202495
....@@ -2335,7 +2510,7 @@
23352510 if (!grp)
23362511 return;
23372512
2338
- atomic_set(&grp->refcount, 1);
2513
+ refcount_set(&grp->refcount, 1);
23392514 grp->active_nodes = 1;
23402515 grp->max_faults_cpu = 0;
23412516 spin_lock_init(&grp->lock);
....@@ -2522,8 +2697,8 @@
25222697 local = 1;
25232698
25242699 /*
2525
- * Retry task to preferred node migration periodically, in case it
2526
- * case it previously failed, or the scheduler moved us.
2700
+ * Retry to migrate task to preferred node periodically, in case it
2701
+ * previously failed, or the scheduler moved us.
25272702 */
25282703 if (time_after(jiffies, p->numa_migrate_retry)) {
25292704 task_numa_placement(p);
....@@ -2558,7 +2733,7 @@
25582733 * The expensive part of numa migration is done from task_work context.
25592734 * Triggered from task_tick_numa().
25602735 */
2561
-void task_numa_work(struct callback_head *work)
2736
+static void task_numa_work(struct callback_head *work)
25622737 {
25632738 unsigned long migrate, next_scan, now = jiffies;
25642739 struct task_struct *p = current;
....@@ -2571,7 +2746,7 @@
25712746
25722747 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
25732748
2574
- work->next = work; /* protect against double add */
2749
+ work->next = work;
25752750 /*
25762751 * Who cares about NUMA placement when they're dying.
25772752 *
....@@ -2618,7 +2793,7 @@
26182793 return;
26192794
26202795
2621
- if (!down_read_trylock(&mm->mmap_sem))
2796
+ if (!mmap_read_trylock(mm))
26222797 return;
26232798 vma = find_vma(mm, start);
26242799 if (!vma) {
....@@ -2646,7 +2821,7 @@
26462821 * Skip inaccessible VMAs to avoid any confusion between
26472822 * PROT_NONE and NUMA hinting ptes
26482823 */
2649
- if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2824
+ if (!vma_is_accessible(vma))
26502825 continue;
26512826
26522827 do {
....@@ -2686,7 +2861,7 @@
26862861 mm->numa_scan_offset = start;
26872862 else
26882863 reset_ptenuma_scan(p);
2689
- up_read(&mm->mmap_sem);
2864
+ mmap_read_unlock(mm);
26902865
26912866 /*
26922867 * Make sure tasks use at least 32x as much time to run other code
....@@ -2700,10 +2875,54 @@
27002875 }
27012876 }
27022877
2878
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2879
+{
2880
+ int mm_users = 0;
2881
+ struct mm_struct *mm = p->mm;
2882
+
2883
+ if (mm) {
2884
+ mm_users = atomic_read(&mm->mm_users);
2885
+ if (mm_users == 1) {
2886
+ mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2887
+ mm->numa_scan_seq = 0;
2888
+ }
2889
+ }
2890
+ p->node_stamp = 0;
2891
+ p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2892
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2893
+ /* Protect against double add, see task_tick_numa and task_numa_work */
2894
+ p->numa_work.next = &p->numa_work;
2895
+ p->numa_faults = NULL;
2896
+ RCU_INIT_POINTER(p->numa_group, NULL);
2897
+ p->last_task_numa_placement = 0;
2898
+ p->last_sum_exec_runtime = 0;
2899
+
2900
+ init_task_work(&p->numa_work, task_numa_work);
2901
+
2902
+ /* New address space, reset the preferred nid */
2903
+ if (!(clone_flags & CLONE_VM)) {
2904
+ p->numa_preferred_nid = NUMA_NO_NODE;
2905
+ return;
2906
+ }
2907
+
2908
+ /*
2909
+ * New thread, keep existing numa_preferred_nid which should be copied
2910
+ * already by arch_dup_task_struct but stagger when scans start.
2911
+ */
2912
+ if (mm) {
2913
+ unsigned int delay;
2914
+
2915
+ delay = min_t(unsigned int, task_scan_max(current),
2916
+ current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2917
+ delay += 2 * TICK_NSEC;
2918
+ p->node_stamp = delay;
2919
+ }
2920
+}
2921
+
27032922 /*
27042923 * Drive the periodic memory faults..
27052924 */
2706
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
2925
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
27072926 {
27082927 struct callback_head *work = &curr->numa_work;
27092928 u64 period, now;
....@@ -2728,10 +2947,8 @@
27282947 curr->numa_scan_period = task_scan_start(curr);
27292948 curr->node_stamp += period;
27302949
2731
- if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2732
- init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2733
- task_work_add(curr, work, true);
2734
- }
2950
+ if (!time_before(jiffies, curr->mm->numa_next_scan))
2951
+ task_work_add(curr, work, TWA_RESUME);
27352952 }
27362953 }
27372954
....@@ -2761,7 +2978,8 @@
27612978 * the preferred node.
27622979 */
27632980 if (dst_nid == p->numa_preferred_nid ||
2764
- (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2981
+ (p->numa_preferred_nid != NUMA_NO_NODE &&
2982
+ src_nid != p->numa_preferred_nid))
27652983 return;
27662984 }
27672985
....@@ -2791,8 +3009,6 @@
27913009 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
27923010 {
27933011 update_load_add(&cfs_rq->load, se->load.weight);
2794
- if (!parent_entity(se))
2795
- update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
27963012 #ifdef CONFIG_SMP
27973013 if (entity_is_task(se)) {
27983014 struct rq *rq = rq_of(cfs_rq);
....@@ -2808,8 +3024,6 @@
28083024 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
28093025 {
28103026 update_load_sub(&cfs_rq->load, se->load.weight);
2811
- if (!parent_entity(se))
2812
- update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
28133027 #ifdef CONFIG_SMP
28143028 if (entity_is_task(se)) {
28153029 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
....@@ -2856,26 +3070,18 @@
28563070 WRITE_ONCE(*ptr, res); \
28573071 } while (0)
28583072
3073
+/*
3074
+ * Remove and clamp on negative, from a local variable.
3075
+ *
3076
+ * A variant of sub_positive(), which does not use explicit load-store
3077
+ * and is thus optimized for local variable updates.
3078
+ */
3079
+#define lsub_positive(_ptr, _val) do { \
3080
+ typeof(_ptr) ptr = (_ptr); \
3081
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3082
+} while (0)
3083
+
28593084 #ifdef CONFIG_SMP
2860
-static inline void
2861
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2862
-{
2863
- cfs_rq->runnable_weight += se->runnable_weight;
2864
-
2865
- cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2866
- cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2867
-}
2868
-
2869
-static inline void
2870
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2871
-{
2872
- cfs_rq->runnable_weight -= se->runnable_weight;
2873
-
2874
- sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2875
- sub_positive(&cfs_rq->avg.runnable_load_sum,
2876
- se_runnable(se) * se->avg.runnable_load_sum);
2877
-}
2878
-
28793085 static inline void
28803086 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
28813087 {
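
[Editor's note] Usage note for the lsub_positive() macro added in the hunk above: like sub_positive() it clamps at zero instead of letting an unsigned value wrap, but it operates on a plain local variable, so it skips the explicit load/store that sub_positive() uses for values shared with lockless readers. A standalone illustration (GNU C, for typeof):

/* Standalone illustration of the clamp-at-zero behaviour of lsub_positive(). */
#include <assert.h>
#include <stdio.h>

#define min_t(type, a, b)	((type)(a) < (type)(b) ? (type)(a) : (type)(b))

#define lsub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	*ptr -= min_t(typeof(*ptr), *ptr, _val);		\
} while (0)

int main(void)
{
	unsigned long util = 300;

	lsub_positive(&util, 100);	/* ordinary subtraction: 300 - 100 */
	assert(util == 200);

	lsub_positive(&util, 1000);	/* would underflow, clamps to 0 */
	assert(util == 0);

	printf("lsub_positive clamps instead of wrapping around\n");
	return 0;
}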
....@@ -2891,45 +3097,36 @@
28913097 }
28923098 #else
28933099 static inline void
2894
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2895
-static inline void
2896
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2897
-static inline void
28983100 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
28993101 static inline void
29003102 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
29013103 #endif
29023104
29033105 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2904
- unsigned long weight, unsigned long runnable)
3106
+ unsigned long weight)
29053107 {
29063108 if (se->on_rq) {
29073109 /* commit outstanding execution time */
29083110 if (cfs_rq->curr == se)
29093111 update_curr(cfs_rq);
2910
- account_entity_dequeue(cfs_rq, se);
2911
- dequeue_runnable_load_avg(cfs_rq, se);
3112
+ update_load_sub(&cfs_rq->load, se->load.weight);
29123113 }
29133114 dequeue_load_avg(cfs_rq, se);
29143115
2915
- se->runnable_weight = runnable;
29163116 update_load_set(&se->load, weight);
29173117
29183118 #ifdef CONFIG_SMP
29193119 do {
2920
- u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
3120
+ u32 divider = get_pelt_divider(&se->avg);
29213121
29223122 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2923
- se->avg.runnable_load_avg =
2924
- div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
29253123 } while (0);
29263124 #endif
29273125
29283126 enqueue_load_avg(cfs_rq, se);
2929
- if (se->on_rq) {
2930
- account_entity_enqueue(cfs_rq, se);
2931
- enqueue_runnable_load_avg(cfs_rq, se);
2932
- }
3127
+ if (se->on_rq)
3128
+ update_load_add(&cfs_rq->load, se->load.weight);
3129
+
29333130 }
29343131
29353132 void reweight_task(struct task_struct *p, int prio)
....@@ -2939,7 +3136,7 @@
29393136 struct load_weight *load = &se->load;
29403137 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
29413138
2942
- reweight_entity(cfs_rq, se, weight, weight);
3139
+ reweight_entity(cfs_rq, se, weight);
29433140 load->inv_weight = sched_prio_to_wmult[prio];
29443141 }
29453142
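
[Editor's note] For reference alongside reweight_task() above: the prio index it receives maps nice levels onto the kernel's sched_prio_to_weight[] table (nice 0 is 1024, each step is roughly a 1.25x change so one nice level shifts the CPU share by about 10% between competing tasks). The entries below are quoted from that table as I recall it, so verify against kernel/sched/core.c before relying on them:

/* A few assumed entries of sched_prio_to_weight[], indexed by nice + 20. */
#include <stdio.h>

int main(void)
{
	const struct { int nice; unsigned int weight; } w[] = {
		{ -20, 88761 }, { -1, 1277 }, { 0, 1024 }, { 1, 820 }, { 19, 15 },
	};

	for (unsigned int i = 0; i < sizeof(w) / sizeof(w[0]); i++)
		printf("nice %3d -> weight %5u (%.2fx of nice 0)\n",
		       w[i].nice, w[i].weight, w[i].weight / 1024.0);
	return 0;
}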
....@@ -3051,50 +3248,6 @@
30513248 */
30523249 return clamp_t(long, shares, MIN_SHARES, tg_shares);
30533250 }
3054
-
3055
-/*
3056
- * This calculates the effective runnable weight for a group entity based on
3057
- * the group entity weight calculated above.
3058
- *
3059
- * Because of the above approximation (2), our group entity weight is
3060
- * an load_avg based ratio (3). This means that it includes blocked load and
3061
- * does not represent the runnable weight.
3062
- *
3063
- * Approximate the group entity's runnable weight per ratio from the group
3064
- * runqueue:
3065
- *
3066
- * grq->avg.runnable_load_avg
3067
- * ge->runnable_weight = ge->load.weight * -------------------------- (7)
3068
- * grq->avg.load_avg
3069
- *
3070
- * However, analogous to above, since the avg numbers are slow, this leads to
3071
- * transients in the from-idle case. Instead we use:
3072
- *
3073
- * ge->runnable_weight = ge->load.weight *
3074
- *
3075
- * max(grq->avg.runnable_load_avg, grq->runnable_weight)
3076
- * ----------------------------------------------------- (8)
3077
- * max(grq->avg.load_avg, grq->load.weight)
3078
- *
3079
- * Where these max() serve both to use the 'instant' values to fix the slow
3080
- * from-idle and avoid the /0 on to-idle, similar to (6).
3081
- */
3082
-static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3083
-{
3084
- long runnable, load_avg;
3085
-
3086
- load_avg = max(cfs_rq->avg.load_avg,
3087
- scale_load_down(cfs_rq->load.weight));
3088
-
3089
- runnable = max(cfs_rq->avg.runnable_load_avg,
3090
- scale_load_down(cfs_rq->runnable_weight));
3091
-
3092
- runnable *= shares;
3093
- if (load_avg)
3094
- runnable /= load_avg;
3095
-
3096
- return clamp_t(long, runnable, MIN_SHARES, shares);
3097
-}
30983251 #endif /* CONFIG_SMP */
30993252
31003253 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
....@@ -3106,7 +3259,7 @@
31063259 static void update_cfs_group(struct sched_entity *se)
31073260 {
31083261 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3109
- long shares, runnable;
3262
+ long shares;
31103263
31113264 if (!gcfs_rq)
31123265 return;
....@@ -3115,16 +3268,15 @@
31153268 return;
31163269
31173270 #ifndef CONFIG_SMP
3118
- runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3271
+ shares = READ_ONCE(gcfs_rq->tg->shares);
31193272
31203273 if (likely(se->load.weight == shares))
31213274 return;
31223275 #else
31233276 shares = calc_group_shares(gcfs_rq);
3124
- runnable = calc_group_runnable(gcfs_rq, shares);
31253277 #endif
31263278
3127
- reweight_entity(cfs_rq_of(se), se, shares, runnable);
3279
+ reweight_entity(cfs_rq_of(se), se, shares);
31283280 }
31293281
31303282 #else /* CONFIG_FAIR_GROUP_SCHED */
....@@ -3137,7 +3289,7 @@
31373289 {
31383290 struct rq *rq = rq_of(cfs_rq);
31393291
3140
- if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3292
+ if (&rq->cfs == cfs_rq) {
31413293 /*
31423294 * There are a few boundary cases this might miss but it should
31433295 * get called often enough that that should (hopefully) not be
....@@ -3161,7 +3313,6 @@
31613313 /**
31623314 * update_tg_load_avg - update the tg's load avg
31633315 * @cfs_rq: the cfs_rq whose avg changed
3164
- * @force: update regardless of how small the difference
31653316 *
31663317 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
31673318 * However, because tg->load_avg is a global value there are performance
....@@ -3173,7 +3324,7 @@
31733324 *
31743325 * Updating tg's load_avg is necessary before update_cfs_share().
31753326 */
3176
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3327
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
31773328 {
31783329 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
31793330
....@@ -3183,11 +3334,9 @@
31833334 if (cfs_rq->tg == &root_task_group)
31843335 return;
31853336
3186
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3337
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
31873338 atomic_long_add(delta, &cfs_rq->tg->load_avg);
31883339 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3189
-
3190
- trace_sched_load_tg(cfs_rq);
31913340 }
31923341 }
31933342
....@@ -3240,7 +3389,6 @@
32403389 se->avg.last_update_time = n_last_update_time;
32413390 }
32423391
3243
-
32443392 /*
32453393 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
32463394 * propagate its contribution. The key to this propagation is the invariant
....@@ -3251,11 +3399,11 @@
32513399 * _IFF_ we look at the pure running and runnable sums. Because they
32523400 * represent the very same entity, just at different points in the hierarchy.
32533401 *
3254
- * Per the above update_tg_cfs_util() is trivial and simply copies the running
3255
- * sum over (but still wrong, because the group entity and group rq do not have
3256
- * their PELT windows aligned).
3402
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3403
+ * and simply copies the running/runnable sum over (but still wrong, because
3404
+ * the group entity and group rq do not have their PELT windows aligned).
32573405 *
3258
- * However, update_tg_cfs_runnable() is more complex. So we have:
3406
+ * However, update_tg_cfs_load() is more complex. So we have:
32593407 *
32603408 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
32613409 *
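
[Editor's note] The hunks that follow repeatedly replace the open-coded "LOAD_AVG_MAX - 1024 + sa->period_contrib" divider with get_pelt_divider(). That helper lives in kernel/sched/pelt.h, not in this diff; the sketch below is my reading of it (including LOAD_AVG_MAX = 47742) and should be treated as an assumption:

/* Userspace sketch of the PELT divider used by the hunks below. */
#include <stdio.h>

#define LOAD_AVG_MAX		47742	/* assumed: saturated PELT geometric series */
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

struct sched_avg { unsigned int period_contrib; };

static inline unsigned int get_pelt_divider(const struct sched_avg *avg)
{
	/* Maximum attainable *_sum given how far we are into the current window. */
	return PELT_MIN_DIVIDER + avg->period_contrib;
}

int main(void)
{
	struct sched_avg a = { .period_contrib = 512 };

	printf("divider mid-window: %u (minimum %u)\n",
	       get_pelt_divider(&a), PELT_MIN_DIVIDER);
	return 0;
}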
....@@ -3308,45 +3456,75 @@
33083456 * XXX: only do this for the part of runnable > running ?
33093457 *
33103458 */
3311
-
33123459 static inline void
33133460 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33143461 {
33153462 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3463
+ u32 divider;
33163464
33173465 /* Nothing to update */
33183466 if (!delta)
33193467 return;
33203468
33213469 /*
3322
- * The relation between sum and avg is:
3323
- *
3324
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
3325
- *
3326
- * however, the PELT windows are not aligned between grq and gse.
3470
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3471
+ * See ___update_load_avg() for details.
33273472 */
3473
+ divider = get_pelt_divider(&cfs_rq->avg);
33283474
33293475 /* Set new sched_entity's utilization */
33303476 se->avg.util_avg = gcfs_rq->avg.util_avg;
3331
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3477
+ se->avg.util_sum = se->avg.util_avg * divider;
33323478
33333479 /* Update parent cfs_rq utilization */
33343480 add_positive(&cfs_rq->avg.util_avg, delta);
3335
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3481
+ cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
33363482 }
33373483
33383484 static inline void
33393485 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
33403486 {
3487
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3488
+ u32 divider;
3489
+
3490
+ /* Nothing to update */
3491
+ if (!delta)
3492
+ return;
3493
+
3494
+ /*
3495
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3496
+ * See ___update_load_avg() for details.
3497
+ */
3498
+ divider = get_pelt_divider(&cfs_rq->avg);
3499
+
3500
+ /* Set new sched_entity's runnable */
3501
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
3502
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
3503
+
3504
+ /* Update parent cfs_rq runnable */
3505
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
3506
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
3507
+}
3508
+
3509
+static inline void
3510
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3511
+{
33413512 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3342
- unsigned long runnable_load_avg, load_avg;
3343
- u64 runnable_load_sum, load_sum = 0;
3513
+ unsigned long load_avg;
3514
+ u64 load_sum = 0;
33443515 s64 delta_sum;
3516
+ u32 divider;
33453517
33463518 if (!runnable_sum)
33473519 return;
33483520
33493521 gcfs_rq->prop_runnable_sum = 0;
3522
+
3523
+ /*
3524
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3525
+ * See ___update_load_avg() for details.
3526
+ */
3527
+ divider = get_pelt_divider(&cfs_rq->avg);
33503528
33513529 if (runnable_sum >= 0) {
33523530 /*
....@@ -3354,7 +3532,7 @@
33543532 * the CPU is saturated running == runnable.
33553533 */
33563534 runnable_sum += se->avg.load_sum;
3357
- runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3535
+ runnable_sum = min_t(long, runnable_sum, divider);
33583536 } else {
33593537 /*
33603538 * Estimate the new unweighted runnable_sum of the gcfs_rq by
....@@ -3379,7 +3557,7 @@
33793557 runnable_sum = max(runnable_sum, running_sum);
33803558
33813559 load_sum = (s64)se_weight(se) * runnable_sum;
3382
- load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3560
+ load_avg = div_s64(load_sum, divider);
33833561
33843562 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
33853563 delta_avg = load_avg - se->avg.load_avg;
....@@ -3388,19 +3566,6 @@
33883566 se->avg.load_avg = load_avg;
33893567 add_positive(&cfs_rq->avg.load_avg, delta_avg);
33903568 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3391
-
3392
- runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3393
- runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3394
- delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3395
- delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3396
-
3397
- se->avg.runnable_load_sum = runnable_sum;
3398
- se->avg.runnable_load_avg = runnable_load_avg;
3399
-
3400
- if (se->on_rq) {
3401
- add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3402
- add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3403
- }
34043569 }
34053570
34063571 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
....@@ -3429,9 +3594,10 @@
34293594
34303595 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
34313596 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3597
+ update_tg_cfs_load(cfs_rq, se, gcfs_rq);
34323598
3433
- trace_sched_load_cfs_rq(cfs_rq);
3434
- trace_sched_load_se(se);
3599
+ trace_pelt_cfs_tp(cfs_rq);
3600
+ trace_pelt_se_tp(se);
34353601
34363602 return 1;
34373603 }
....@@ -3468,7 +3634,7 @@
34683634
34693635 #else /* CONFIG_FAIR_GROUP_SCHED */
34703636
3471
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3637
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
34723638
34733639 static inline int propagate_entity_load_avg(struct sched_entity *se)
34743640 {
....@@ -3498,18 +3664,18 @@
34983664 static inline int
34993665 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
35003666 {
3501
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3667
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
35023668 struct sched_avg *sa = &cfs_rq->avg;
35033669 int decayed = 0;
35043670
35053671 if (cfs_rq->removed.nr) {
35063672 unsigned long r;
3507
- u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3673
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35083674
35093675 raw_spin_lock(&cfs_rq->removed.lock);
35103676 swap(cfs_rq->removed.util_avg, removed_util);
35113677 swap(cfs_rq->removed.load_avg, removed_load);
3512
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3678
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
35133679 cfs_rq->removed.nr = 0;
35143680 raw_spin_unlock(&cfs_rq->removed.lock);
35153681
....@@ -3520,8 +3686,29 @@
35203686 r = removed_util;
35213687 sub_positive(&sa->util_avg, r);
35223688 sub_positive(&sa->util_sum, r * divider);
3689
+ /*
3690
+ * Because of rounding, se->util_sum might ends up being +1 more than
3691
+ * cfs->util_sum. Although this is not a problem by itself, detaching
3692
+ * a lot of tasks with the rounding problem between 2 updates of
3693
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3694
+ * cfs_util_avg is not.
3695
+ * Check that util_sum is still above its lower bound for the new
3696
+ * util_avg. Given that period_contrib might have moved since the last
3697
+ * sync, we are only sure that util_sum must be above or equal to
3698
+ * util_avg * minimum possible divider
3699
+ */
3700
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
35233701
3524
- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3702
+ r = removed_runnable;
3703
+ sub_positive(&sa->runnable_avg, r);
3704
+ sub_positive(&sa->runnable_sum, r * divider);
3705
+
3706
+ /*
3707
+ * removed_runnable is the unweighted version of removed_load so we
3708
+ * can use it to estimate removed_load_sum.
3709
+ */
3710
+ add_tg_cfs_propagate(cfs_rq,
3711
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
35253712
35263713 decayed = 1;
35273714 }
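
[Editor's note] One way to read the new lower bound above: util_avg is maintained as roughly util_sum / divider, and the divider is never smaller than PELT_MIN_DIVIDER, so after subtracting the slightly rounded-up sums of detached entities, util_sum must not be allowed to drop below util_avg * PELT_MIN_DIVIDER. A standalone restatement of that clamp (LOAD_AVG_MAX = 47742 is assumed, as above):

/* Standalone restatement of the util_sum lower-bound clamp added above. */
#include <stdio.h>

#define LOAD_AVG_MAX		47742
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

int main(void)
{
	unsigned int util_avg = 100;
	/* Pretend accumulated +1 rounding from many detaches left util_sum low. */
	unsigned int util_sum = util_avg * PELT_MIN_DIVIDER - 300;

	/* The max_t() in the hunk restores the invariant. */
	if (util_sum < util_avg * PELT_MIN_DIVIDER)
		util_sum = util_avg * PELT_MIN_DIVIDER;

	printf("util_avg %u keeps util_sum >= %u\n", util_avg, util_sum);
	return 0;
}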
....@@ -3533,9 +3720,6 @@
35333720 cfs_rq->load_last_update_time_copy = sa->last_update_time;
35343721 #endif
35353722
3536
- if (decayed)
3537
- cfs_rq_util_change(cfs_rq, 0);
3538
-
35393723 return decayed;
35403724 }
35413725
....@@ -3543,14 +3727,17 @@
35433727 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
35443728 * @cfs_rq: cfs_rq to attach to
35453729 * @se: sched_entity to attach
3546
- * @flags: migration hints
35473730 *
35483731 * Must call update_cfs_rq_load_avg() before this, since we rely on
35493732 * cfs_rq->avg.last_update_time being current.
35503733 */
3551
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3734
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
35523735 {
3553
- u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3736
+ /*
3737
+ * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3738
+ * See ___update_load_avg() for details.
3739
+ */
3740
+ u32 divider = get_pelt_divider(&cfs_rq->avg);
35543741
35553742 /*
35563743 * When we attach the @se to the @cfs_rq, we must align the decay
....@@ -3570,23 +3757,25 @@
35703757 */
35713758 se->avg.util_sum = se->avg.util_avg * divider;
35723759
3573
- se->avg.load_sum = divider;
3574
- if (se_weight(se)) {
3575
- se->avg.load_sum =
3576
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3577
- }
3760
+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
35783761
3579
- se->avg.runnable_load_sum = se->avg.load_sum;
3762
+ se->avg.load_sum = se->avg.load_avg * divider;
3763
+ if (se_weight(se) < se->avg.load_sum)
3764
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
3765
+ else
3766
+ se->avg.load_sum = 1;
35803767
35813768 enqueue_load_avg(cfs_rq, se);
35823769 cfs_rq->avg.util_avg += se->avg.util_avg;
35833770 cfs_rq->avg.util_sum += se->avg.util_sum;
3771
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3772
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
35843773
35853774 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
35863775
3587
- cfs_rq_util_change(cfs_rq, flags);
3776
+ cfs_rq_util_change(cfs_rq, 0);
35883777
3589
- trace_sched_load_cfs_rq(cfs_rq);
3778
+ trace_pelt_cfs_tp(cfs_rq);
35903779 }
35913780
35923781 /**
....@@ -3602,12 +3791,14 @@
36023791 dequeue_load_avg(cfs_rq, se);
36033792 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
36043793 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3794
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
3795
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
36053796
36063797 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
36073798
36083799 cfs_rq_util_change(cfs_rq, 0);
36093800
3610
- trace_sched_load_cfs_rq(cfs_rq);
3801
+ trace_pelt_cfs_tp(cfs_rq);
36113802 }
36123803
36133804 /*
....@@ -3623,12 +3814,15 @@
36233814 u64 now = cfs_rq_clock_pelt(cfs_rq);
36243815 int decayed;
36253816
3817
+ trace_android_vh_prepare_update_load_avg_se(se, flags);
36263818 /*
36273819 * Track task load average for carrying it to new CPU after migrated, and
36283820 * track group sched_entity load average for task_h_load calc in migration
36293821 */
36303822 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
36313823 __update_load_avg_se(now, cfs_rq, se);
3824
+
3825
+ trace_android_vh_finish_update_load_avg_se(se, flags);
36323826
36333827 decayed = update_cfs_rq_load_avg(now, cfs_rq);
36343828 decayed |= propagate_entity_load_avg(se);
....@@ -3642,11 +3836,15 @@
36423836 *
36433837 * IOW we're enqueueing a task on a new CPU.
36443838 */
3645
- attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3646
- update_tg_load_avg(cfs_rq, 0);
3839
+ attach_entity_load_avg(cfs_rq, se);
3840
+ update_tg_load_avg(cfs_rq);
36473841
3648
- } else if (decayed && (flags & UPDATE_TG))
3649
- update_tg_load_avg(cfs_rq, 0);
3842
+ } else if (decayed) {
3843
+ cfs_rq_util_change(cfs_rq, 0);
3844
+
3845
+ if (flags & UPDATE_TG)
3846
+ update_tg_load_avg(cfs_rq);
3847
+ }
36503848 }
36513849
36523850 #ifndef CONFIG_64BIT
....@@ -3674,20 +3872,22 @@
36743872 * Synchronize entity load avg of dequeued entity without locking
36753873 * the previous rq.
36763874 */
3677
-void sync_entity_load_avg(struct sched_entity *se)
3875
+static void sync_entity_load_avg(struct sched_entity *se)
36783876 {
36793877 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36803878 u64 last_update_time;
36813879
36823880 last_update_time = cfs_rq_last_update_time(cfs_rq);
3881
+ trace_android_vh_prepare_update_load_avg_se(se, 0);
36833882 __update_load_avg_blocked_se(last_update_time, se);
3883
+ trace_android_vh_finish_update_load_avg_se(se, 0);
36843884 }
36853885
36863886 /*
36873887 * Task first catches up with cfs_rq, and then subtract
36883888 * itself from the cfs_rq (task must be off the queue now).
36893889 */
3690
-void remove_entity_load_avg(struct sched_entity *se)
3890
+static void remove_entity_load_avg(struct sched_entity *se)
36913891 {
36923892 struct cfs_rq *cfs_rq = cfs_rq_of(se);
36933893 unsigned long flags;
....@@ -3696,10 +3896,6 @@
36963896 * tasks cannot exit without having gone through wake_up_new_task() ->
36973897 * post_init_entity_util_avg() which will have added things to the
36983898 * cfs_rq, so we can remove unconditionally.
3699
- *
3700
- * Similarly for groups, they will have passed through
3701
- * post_init_entity_util_avg() before unregister_sched_fair_group()
3702
- * calls this.
37033899 */
37043900
37053901 sync_entity_load_avg(se);
....@@ -3708,13 +3904,13 @@
37083904 ++cfs_rq->removed.nr;
37093905 cfs_rq->removed.util_avg += se->avg.util_avg;
37103906 cfs_rq->removed.load_avg += se->avg.load_avg;
3711
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
3907
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
37123908 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
37133909 }
37143910
3715
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3911
+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
37163912 {
3717
- return cfs_rq->avg.runnable_load_avg;
3913
+ return cfs_rq->avg.runnable_avg;
37183914 }
37193915
37203916 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
....@@ -3722,7 +3918,7 @@
37223918 return cfs_rq->avg.load_avg;
37233919 }
37243920
3725
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3921
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
37263922
37273923 static inline unsigned long task_util(struct task_struct *p)
37283924 {
....@@ -3733,10 +3929,10 @@
37333929 {
37343930 struct util_est ue = READ_ONCE(p->se.avg.util_est);
37353931
3736
- return max(ue.ewma, ue.enqueued);
3932
+ return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
37373933 }
37383934
3739
-unsigned long task_util_est(struct task_struct *p)
3935
+static inline unsigned long task_util_est(struct task_struct *p)
37403936 {
37413937 return max(task_util(p), _task_util_est(p));
37423938 }
....@@ -3765,13 +3961,29 @@
37653961
37663962 /* Update root cfs_rq's estimated utilization */
37673963 enqueued = cfs_rq->avg.util_est.enqueued;
3768
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
3964
+ enqueued += _task_util_est(p);
37693965 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
37703966
3771
- /* Update plots for Task and CPU estimated utilization */
3772
- trace_sched_util_est_task(p, &p->se.avg);
3773
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3967
+ trace_sched_util_est_cfs_tp(cfs_rq);
37743968 }
3969
+
3970
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3971
+ struct task_struct *p)
3972
+{
3973
+ unsigned int enqueued;
3974
+
3975
+ if (!sched_feat(UTIL_EST))
3976
+ return;
3977
+
3978
+ /* Update root cfs_rq's estimated utilization */
3979
+ enqueued = cfs_rq->avg.util_est.enqueued;
3980
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
3981
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3982
+
3983
+ trace_sched_util_est_cfs_tp(cfs_rq);
3984
+}
3985
+
3986
+#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
37753987
37763988 /*
37773989 * Check if a (signed) value is within a specified (unsigned) margin,
....@@ -3786,24 +3998,20 @@
37863998 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
37873999 }
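within_margin() above tests |value| < margin with a single unsigned comparison: adding (margin - 1) maps the open interval (-margin, margin) onto [0, 2*margin - 2], so anything outside it wraps to a large unsigned number and fails the compare. A small self-contained check of that property; the margin of 10 is just the ~1% of SCHED_CAPACITY_SCALE used by UTIL_EST_MARGIN above:

#include <assert.h>
#include <stdlib.h>

/* Same trick as within_margin() above: true iff -margin < value < margin. */
static int within_margin(int value, int margin)
{
	return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}

int main(void)
{
	int margin = 10;	/* ~1% of SCHED_CAPACITY_SCALE (1024) */
	int v;

	/* The shifted unsigned compare agrees with the naive |v| < margin test. */
	for (v = -50; v <= 50; v++)
		assert(within_margin(v, margin) == (abs(v) < margin));

	return 0;
}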
37884000
3789
-static void
3790
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4001
+static inline void util_est_update(struct cfs_rq *cfs_rq,
4002
+ struct task_struct *p,
4003
+ bool task_sleep)
37914004 {
3792
- long last_ewma_diff;
4005
+ long last_ewma_diff, last_enqueued_diff;
37934006 struct util_est ue;
3794
- int cpu;
4007
+ int ret = 0;
4008
+
4009
+ trace_android_rvh_util_est_update(cfs_rq, p, task_sleep, &ret);
4010
+ if (ret)
4011
+ return;
37954012
37964013 if (!sched_feat(UTIL_EST))
37974014 return;
3798
-
3799
- /* Update root cfs_rq's estimated utilization */
3800
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
3801
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
3802
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
3803
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3804
-
3805
- /* Update plots for CPU's estimated utilization */
3806
- trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
38074015
38084016 /*
38094017 * Skip update of task's estimated utilization when the task has not
....@@ -3820,11 +4028,13 @@
38204028 if (ue.enqueued & UTIL_AVG_UNCHANGED)
38214029 return;
38224030
4031
+ last_enqueued_diff = ue.enqueued;
4032
+
38234033 /*
38244034 * Reset EWMA on utilization increases, the moving average is used only
38254035 * to smooth utilization decreases.
38264036 */
3827
- ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
4037
+ ue.enqueued = task_util(p);
38284038 if (sched_feat(UTIL_EST_FASTUP)) {
38294039 if (ue.ewma < ue.enqueued) {
38304040 ue.ewma = ue.enqueued;
....@@ -3833,19 +4043,23 @@
38334043 }
38344044
38354045 /*
3836
- * Skip update of task's estimated utilization when its EWMA is
4046
+ * Skip update of task's estimated utilization when its members are
38374047 * already ~1% close to its last activation value.
38384048 */
38394049 last_ewma_diff = ue.enqueued - ue.ewma;
3840
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
4050
+ last_enqueued_diff -= ue.enqueued;
4051
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4052
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4053
+ goto done;
4054
+
38414055 return;
4056
+ }
38424057
38434058 /*
38444059 * To avoid overestimation of actual task utilization, skip updates if
38454060 * we cannot guarantee there is idle time on this CPU.
38464061 */
3847
- cpu = cpu_of(rq_of(cfs_rq));
3848
- if (task_util(p) > capacity_orig_of(cpu))
4062
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
38494063 return;
38504064
38514065 /*
....@@ -3869,39 +4083,26 @@
38694083 ue.ewma += last_ewma_diff;
38704084 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
38714085 done:
4086
+ ue.enqueued |= UTIL_AVG_UNCHANGED;
38724087 WRITE_ONCE(p->se.avg.util_est, ue);
38734088
3874
- /* Update plots for Task's estimated utilization */
3875
- trace_sched_util_est_task(p, &p->se.avg);
4089
+ trace_sched_util_est_se_tp(&p->se);
38764090 }
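The shift/add sequence feeding the done: label above is the usual exponentially weighted moving average, ewma' = (1 - w) * ewma + w * enqueued with w = 1/2^UTIL_EST_WEIGHT_SHIFT. A hedged standalone sketch; the shift value of 2 (w = 1/4) is an assumption about this tree's sched.h:

#include <stdio.h>

#define UTIL_EST_WEIGHT_SHIFT	2	/* assumed: w = 1/4 */

/* Same arithmetic as the EWMA update above, written out in one place. */
static unsigned int ewma_update(unsigned int ewma, unsigned int enqueued)
{
	int last_ewma_diff = (int)enqueued - (int)ewma;
	int new_ewma = ((int)ewma << UTIL_EST_WEIGHT_SHIFT) + last_ewma_diff;

	/* Equivalent to (3 * old_ewma + enqueued) / 4 for shift == 2. */
	return (unsigned int)(new_ewma >> UTIL_EST_WEIGHT_SHIFT);
}

int main(void)
{
	printf("%u\n", ewma_update(400, 200));	/* -> 350 */
	return 0;
}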
38774091
38784092 static inline int task_fits_capacity(struct task_struct *p, long capacity)
38794093 {
3880
- return capacity * 1024 > uclamp_task_util(p) * capacity_margin;
4094
+ return fits_capacity(uclamp_task_util(p), capacity);
38814095 }
3882
-
3883
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
3884
-static inline bool task_fits_max(struct task_struct *p, int cpu)
3885
-{
3886
- unsigned long capacity = capacity_of(cpu);
3887
- unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
3888
-
3889
- if (capacity == max_capacity)
3890
- return true;
3891
-
3892
- if (capacity * capacity_margin > max_capacity * 1024)
3893
- return true;
3894
-
3895
- return task_fits_capacity(p, capacity);
3896
-}
3897
-#endif
38984096
38994097 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
39004098 {
3901
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
4099
+ bool need_update = true;
4100
+
4101
+ trace_android_rvh_update_misfit_status(p, rq, &need_update);
4102
+ if (!static_branch_unlikely(&sched_asym_cpucapacity) || !need_update)
39024103 return;
39034104
3904
- if (!p) {
4105
+ if (!p || p->nr_cpus_allowed == 1) {
39054106 rq->misfit_task_load = 0;
39064107 return;
39074108 }
....@@ -3911,7 +4112,11 @@
39114112 return;
39124113 }
39134114
3914
- rq->misfit_task_load = task_h_load(p);
4115
+ /*
4116
+ * Make sure that misfit_task_load will not be null even if
4117
+ * task_h_load() returns 0.
4118
+ */
4119
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
39154120 }
39164121
39174122 #else /* CONFIG_SMP */
....@@ -3928,11 +4133,11 @@
39284133 static inline void remove_entity_load_avg(struct sched_entity *se) {}
39294134
39304135 static inline void
3931
-attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
4136
+attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39324137 static inline void
39334138 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
39344139
3935
-static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
4140
+static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
39364141 {
39374142 return 0;
39384143 }
....@@ -3941,8 +4146,11 @@
39414146 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
39424147
39434148 static inline void
3944
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3945
- bool task_sleep) {}
4149
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4150
+
4151
+static inline void
4152
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4153
+ bool task_sleep) {}
39464154 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
39474155
39484156 #endif /* CONFIG_SMP */
....@@ -3990,6 +4198,7 @@
39904198
39914199 /* ensure we never gain time by being placed backwards. */
39924200 se->vruntime = max_vruntime(se->vruntime, vruntime);
4201
+ trace_android_rvh_place_entity(cfs_rq, se, initial, vruntime);
39934202 }
39944203
39954204 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
....@@ -4014,6 +4223,7 @@
40144223 #endif
40154224 }
40164225
4226
+static inline bool cfs_bandwidth_used(void);
40174227
40184228 /*
40194229 * MIGRATION
....@@ -4078,8 +4288,8 @@
40784288 * - Add its new weight to cfs_rq->load.weight
40794289 */
40804290 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
4291
+ se_update_runnable(se);
40814292 update_cfs_group(se);
4082
- enqueue_runnable_load_avg(cfs_rq, se);
40834293 account_entity_enqueue(cfs_rq, se);
40844294
40854295 if (flags & ENQUEUE_WAKEUP)
....@@ -4092,10 +4302,16 @@
40924302 __enqueue_entity(cfs_rq, se);
40934303 se->on_rq = 1;
40944304
4095
- if (cfs_rq->nr_running == 1) {
4305
+ /*
4306
+ * When bandwidth control is enabled, cfs might have been removed
4307
+ * because a parent has been throttled while cfs->nr_running > 1. Try to
4308
+ * add it unconditionally.

4309
+ */
4310
+ if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
40964311 list_add_leaf_cfs_rq(cfs_rq);
4312
+
4313
+ if (cfs_rq->nr_running == 1)
40974314 check_enqueue_throttle(cfs_rq);
4098
- }
40994315 }
41004316
41014317 static void __clear_buddies_last(struct sched_entity *se)
....@@ -4156,13 +4372,13 @@
41564372 /*
41574373 * When dequeuing a sched_entity, we must:
41584374 * - Update loads to have both entity and cfs_rq synced with now.
4159
- * - Substract its load from the cfs_rq->runnable_avg.
4160
- * - Substract its previous weight from cfs_rq->load.weight.
4375
+ * - Subtract its load from the cfs_rq->runnable_avg.
4376
+ * - Subtract its previous weight from cfs_rq->load.weight.
41614377 * - For group entity, update its weight to reflect the new share
41624378 * of its group cfs_rq.
41634379 */
41644380 update_load_avg(cfs_rq, se, UPDATE_TG);
4165
- dequeue_runnable_load_avg(cfs_rq, se);
4381
+ se_update_runnable(se);
41664382
41674383 update_stats_dequeue(cfs_rq, se, flags);
41684384
....@@ -4206,11 +4422,16 @@
42064422 unsigned long ideal_runtime, delta_exec;
42074423 struct sched_entity *se;
42084424 s64 delta;
4425
+ bool skip_preempt = false;
42094426
42104427 ideal_runtime = sched_slice(cfs_rq, curr);
42114428 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4429
+ trace_android_rvh_check_preempt_tick(current, &ideal_runtime, &skip_preempt,
4430
+ delta_exec, cfs_rq, curr, sysctl_sched_min_granularity);
4431
+ if (skip_preempt)
4432
+ return;
42124433 if (delta_exec > ideal_runtime) {
4213
- resched_curr(rq_of(cfs_rq));
4434
+ resched_curr_lazy(rq_of(cfs_rq));
42144435 /*
42154436 * The current task ran long enough, ensure it doesn't get
42164437 * re-elected due to buddy favours.
....@@ -4234,11 +4455,10 @@
42344455 return;
42354456
42364457 if (delta > ideal_runtime)
4237
- resched_curr(rq_of(cfs_rq));
4458
+ resched_curr_lazy(rq_of(cfs_rq));
42384459 }
42394460
4240
-static void
4241
-set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4461
+void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
42424462 {
42434463 /* 'current' is not kept within the tree. */
42444464 if (se->on_rq) {
....@@ -4260,7 +4480,8 @@
42604480 * least twice that of our own weight (i.e. dont track it
42614481 * when there are only lesser-weight tasks around):
42624482 */
4263
- if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
4483
+ if (schedstat_enabled() &&
4484
+ rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
42644485 schedstat_set(se->statistics.slice_max,
42654486 max((u64)schedstat_val(se->statistics.slice_max),
42664487 se->sum_exec_runtime - se->prev_sum_exec_runtime));
....@@ -4268,6 +4489,8 @@
42684489
42694490 se->prev_sum_exec_runtime = se->sum_exec_runtime;
42704491 }
4492
+EXPORT_SYMBOL_GPL(set_next_entity);
4493
+
42714494
42724495 static int
42734496 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
....@@ -4283,7 +4506,11 @@
42834506 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
42844507 {
42854508 struct sched_entity *left = __pick_first_entity(cfs_rq);
4286
- struct sched_entity *se;
4509
+ struct sched_entity *se = NULL;
4510
+
4511
+ trace_android_rvh_pick_next_entity(cfs_rq, curr, &se);
4512
+ if (se)
4513
+ goto done;
42874514
42884515 /*
42894516 * If curr is set we have to see if its left of the leftmost entity
....@@ -4313,18 +4540,19 @@
43134540 se = second;
43144541 }
43154542
4316
- /*
4317
- * Prefer last buddy, try to return the CPU to a preempted task.
4318
- */
4319
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4320
- se = cfs_rq->last;
4321
-
4322
- /*
4323
- * Someone really wants this to run. If it's not unfair, run it.
4324
- */
4325
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4543
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4544
+ /*
4545
+ * Someone really wants this to run. If it's not unfair, run it.
4546
+ */
43264547 se = cfs_rq->next;
4548
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4549
+ /*
4550
+ * Prefer last buddy, try to return the CPU to a preempted task.
4551
+ */
4552
+ se = cfs_rq->last;
4553
+ }
43274554
4555
+done:
43284556 clear_buddies(cfs_rq, se);
43294557
43304558 return se;
....@@ -4376,7 +4604,7 @@
43764604 * validating it and just reschedule.
43774605 */
43784606 if (queued) {
4379
- resched_curr(rq_of(cfs_rq));
4607
+ resched_curr_lazy(rq_of(cfs_rq));
43804608 return;
43814609 }
43824610 /*
....@@ -4457,26 +4685,17 @@
44574685 return &tg->cfs_bandwidth;
44584686 }
44594687
4460
-/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
4461
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4462
-{
4463
- if (unlikely(cfs_rq->throttle_count))
4464
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
4465
-
4466
- return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
4467
-}
4468
-
44694688 /* returns 0 on failure to allocate runtime */
4470
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4689
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4690
+ struct cfs_rq *cfs_rq, u64 target_runtime)
44714691 {
4472
- struct task_group *tg = cfs_rq->tg;
4473
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4474
- u64 amount = 0, min_amount;
4692
+ u64 min_amount, amount = 0;
4693
+
4694
+ lockdep_assert_held(&cfs_b->lock);
44754695
44764696 /* note: this is a positive sum as runtime_remaining <= 0 */
4477
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4697
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
44784698
4479
- raw_spin_lock(&cfs_b->lock);
44804699 if (cfs_b->quota == RUNTIME_INF)
44814700 amount = min_amount;
44824701 else {
....@@ -4488,11 +4707,23 @@
44884707 cfs_b->idle = 0;
44894708 }
44904709 }
4491
- raw_spin_unlock(&cfs_b->lock);
44924710
44934711 cfs_rq->runtime_remaining += amount;
44944712
44954713 return cfs_rq->runtime_remaining > 0;
4714
+}
4715
+
4716
+/* returns 0 on failure to allocate runtime */
4717
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4718
+{
4719
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4720
+ int ret;
4721
+
4722
+ raw_spin_lock(&cfs_b->lock);
4723
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4724
+ raw_spin_unlock(&cfs_b->lock);
4725
+
4726
+ return ret;
44964727 }
44974728
44984729 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
....@@ -4510,7 +4741,7 @@
45104741 * hierarchy can be throttled
45114742 */
45124743 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4513
- resched_curr(rq_of(cfs_rq));
4744
+ resched_curr_lazy(rq_of(cfs_rq));
45144745 }
45154746
45164747 static __always_inline
....@@ -4557,9 +4788,8 @@
45574788
45584789 cfs_rq->throttle_count--;
45594790 if (!cfs_rq->throttle_count) {
4560
- /* adjust cfs_rq_clock_task() */
4561
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4562
- cfs_rq->throttled_clock_task;
4791
+ cfs_rq->throttled_clock_pelt_time += rq_clock_task_mult(rq) -
4792
+ cfs_rq->throttled_clock_pelt;
45634793
45644794 /* Add cfs_rq with already running entity in the list */
45654795 if (cfs_rq->nr_running >= 1)
....@@ -4576,7 +4806,7 @@
45764806
45774807 /* group is entering throttled state, stop time */
45784808 if (!cfs_rq->throttle_count) {
4579
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
4809
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(rq);
45804810 list_del_leaf_cfs_rq(cfs_rq);
45814811 }
45824812 cfs_rq->throttle_count++;
....@@ -4584,13 +4814,33 @@
45844814 return 0;
45854815 }
45864816
4587
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4817
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
45884818 {
45894819 struct rq *rq = rq_of(cfs_rq);
45904820 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
45914821 struct sched_entity *se;
4592
- long task_delta, dequeue = 1;
4593
- bool empty;
4822
+ long task_delta, idle_task_delta, dequeue = 1;
4823
+
4824
+ raw_spin_lock(&cfs_b->lock);
4825
+ /* This will start the period timer if necessary */
4826
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4827
+ /*
4828
+ * We have raced with bandwidth becoming available, and if we
4829
+ * actually throttled the timer might not unthrottle us for an
4830
+ * entire period. We additionally needed to make sure that any
4831
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
4832
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
4833
+ * for 1ns of runtime rather than just check cfs_b.
4834
+ */
4835
+ dequeue = 0;
4836
+ } else {
4837
+ list_add_tail_rcu(&cfs_rq->throttled_list,
4838
+ &cfs_b->throttled_cfs_rq);
4839
+ }
4840
+ raw_spin_unlock(&cfs_b->lock);
4841
+
4842
+ if (!dequeue)
4843
+ return false; /* Throttle no longer required. */
45944844
45954845 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
45964846
....@@ -4600,15 +4850,22 @@
46004850 rcu_read_unlock();
46014851
46024852 task_delta = cfs_rq->h_nr_running;
4853
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46034854 for_each_sched_entity(se) {
46044855 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
46054856 /* throttled entity or throttle-on-deactivate */
46064857 if (!se->on_rq)
46074858 break;
46084859
4609
- if (dequeue)
4860
+ if (dequeue) {
46104861 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4862
+ } else {
4863
+ update_load_avg(qcfs_rq, se, 0);
4864
+ se_update_runnable(se);
4865
+ }
4866
+
46114867 qcfs_rq->h_nr_running -= task_delta;
4868
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
46124869
46134870 if (qcfs_rq->load.weight)
46144871 dequeue = 0;
....@@ -4617,29 +4874,13 @@
46174874 if (!se)
46184875 sub_nr_running(rq, task_delta);
46194876
4877
+ /*
4878
+ * Note: distribution will already see us throttled via the
4879
+ * throttled-list. rq->lock protects completion.
4880
+ */
46204881 cfs_rq->throttled = 1;
46214882 cfs_rq->throttled_clock = rq_clock(rq);
4622
- raw_spin_lock(&cfs_b->lock);
4623
- empty = list_empty(&cfs_b->throttled_cfs_rq);
4624
-
4625
- /*
4626
- * Add to the _head_ of the list, so that an already-started
4627
- * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4628
- * not running add to the tail so that later runqueues don't get starved.
4629
- */
4630
- if (cfs_b->distribute_running)
4631
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4632
- else
4633
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4634
-
4635
- /*
4636
- * If we're the first throttled task, make sure the bandwidth
4637
- * timer is running.
4638
- */
4639
- if (empty)
4640
- start_cfs_bandwidth(cfs_b);
4641
-
4642
- raw_spin_unlock(&cfs_b->lock);
4883
+ return true;
46434884 }
46444885
46454886 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
....@@ -4647,8 +4888,7 @@
46474888 struct rq *rq = rq_of(cfs_rq);
46484889 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
46494890 struct sched_entity *se;
4650
- int enqueue = 1;
4651
- long task_delta;
4891
+ long task_delta, idle_task_delta;
46524892
46534893 se = cfs_rq->tg->se[cpu_of(rq)];
46544894
....@@ -4668,34 +4908,70 @@
46684908 return;
46694909
46704910 task_delta = cfs_rq->h_nr_running;
4911
+ idle_task_delta = cfs_rq->idle_h_nr_running;
46714912 for_each_sched_entity(se) {
46724913 if (se->on_rq)
4673
- enqueue = 0;
4674
-
4914
+ break;
46754915 cfs_rq = cfs_rq_of(se);
4676
- if (enqueue)
4677
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4678
- cfs_rq->h_nr_running += task_delta;
4916
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
46794917
4918
+ cfs_rq->h_nr_running += task_delta;
4919
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4920
+
4921
+ /* end evaluation on encountering a throttled cfs_rq */
46804922 if (cfs_rq_throttled(cfs_rq))
4923
+ goto unthrottle_throttle;
4924
+ }
4925
+
4926
+ for_each_sched_entity(se) {
4927
+ cfs_rq = cfs_rq_of(se);
4928
+
4929
+ update_load_avg(cfs_rq, se, UPDATE_TG);
4930
+ se_update_runnable(se);
4931
+
4932
+ cfs_rq->h_nr_running += task_delta;
4933
+ cfs_rq->idle_h_nr_running += idle_task_delta;
4934
+
4935
+
4936
+ /* end evaluation on encountering a throttled cfs_rq */
4937
+ if (cfs_rq_throttled(cfs_rq))
4938
+ goto unthrottle_throttle;
4939
+
4940
+ /*
4941
+ * One parent has been throttled and cfs_rq removed from the
4942
+ * list. Add it back to not break the leaf list.
4943
+ */
4944
+ if (throttled_hierarchy(cfs_rq))
4945
+ list_add_leaf_cfs_rq(cfs_rq);
4946
+ }
4947
+
4948
+ /* At this point se is NULL and we are at root level */
4949
+ add_nr_running(rq, task_delta);
4950
+
4951
+unthrottle_throttle:
4952
+ /*
4953
+ * The cfs_rq_throttled() breaks in the above iteration can result in
4954
+ * incomplete leaf list maintenance, which can then trigger the
4955
+ * assertion below.
4956
+ */
4957
+ for_each_sched_entity(se) {
4958
+ cfs_rq = cfs_rq_of(se);
4959
+
4960
+ if (list_add_leaf_cfs_rq(cfs_rq))
46814961 break;
46824962 }
46834963
46844964 assert_list_leaf_cfs_rq(rq);
4685
-
4686
- if (!se)
4687
- add_nr_running(rq, task_delta);
46884965
46894966 /* Determine whether we need to wake up potentially idle CPU: */
46904967 if (rq->curr == rq->idle && rq->cfs.nr_running)
46914968 resched_curr(rq);
46924969 }
46934970
4694
-static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4971
+static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
46954972 {
46964973 struct cfs_rq *cfs_rq;
4697
- u64 runtime;
4698
- u64 starting_runtime = remaining;
4974
+ u64 runtime, remaining = 1;
46994975
47004976 rcu_read_lock();
47014977 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
....@@ -4703,17 +4979,20 @@
47034979 struct rq *rq = rq_of(cfs_rq);
47044980 struct rq_flags rf;
47054981
4706
- rq_lock(rq, &rf);
4982
+ rq_lock_irqsave(rq, &rf);
47074983 if (!cfs_rq_throttled(cfs_rq))
47084984 goto next;
47094985
47104986 /* By the above check, this should never be true */
47114987 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
47124988
4989
+ raw_spin_lock(&cfs_b->lock);
47134990 runtime = -cfs_rq->runtime_remaining + 1;
4714
- if (runtime > remaining)
4715
- runtime = remaining;
4716
- remaining -= runtime;
4991
+ if (runtime > cfs_b->runtime)
4992
+ runtime = cfs_b->runtime;
4993
+ cfs_b->runtime -= runtime;
4994
+ remaining = cfs_b->runtime;
4995
+ raw_spin_unlock(&cfs_b->lock);
47174996
47184997 cfs_rq->runtime_remaining += runtime;
47194998
....@@ -4722,14 +5001,12 @@
47225001 unthrottle_cfs_rq(cfs_rq);
47235002
47245003 next:
4725
- rq_unlock(rq, &rf);
5004
+ rq_unlock_irqrestore(rq, &rf);
47265005
47275006 if (!remaining)
47285007 break;
47295008 }
47305009 rcu_read_unlock();
4731
-
4732
- return starting_runtime - remaining;
47335010 }
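The reworked distribute_cfs_runtime() above no longer hands out a pre-grabbed batch: under cfs_b->lock it tops each throttled cfs_rq up to exactly +1ns of runtime_remaining, capped by whatever is left in the global pool, and stops once the pool is empty. A hedged numeric walk-through of that accounting; all of the values below are invented:

#include <stdio.h>

int main(void)
{
	/* Invented example: global pool (ns) and three throttled runqueues
	 * whose runtime_remaining has gone negative. */
	long long cfs_b_runtime = 5000;
	long long runtime_remaining[3] = { -1500, -2500, -4000 };
	int i;

	for (i = 0; i < 3 && cfs_b_runtime > 0; i++) {
		/* Just enough to reach +1ns, as in the loop above. */
		long long runtime = -runtime_remaining[i] + 1;

		if (runtime > cfs_b_runtime)
			runtime = cfs_b_runtime;
		cfs_b_runtime -= runtime;
		runtime_remaining[i] += runtime;

		/* Only a cfs_rq that ends up > 0 would be unthrottled. */
		printf("rq%d gets %lld, runtime_remaining=%lld, pool=%lld\n",
		       i, runtime, runtime_remaining[i], cfs_b_runtime);
	}
	return 0;
}

With these numbers the first two runqueues reach +1ns and would be unthrottled; the third drains the pool, stays negative, and waits for the next period, which is exactly why the caller re-checks the throttled list while bandwidth remains.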
47345011
47355012 /*
....@@ -4738,9 +5015,8 @@
47385015 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
47395016 * used to track this state.
47405017 */
4741
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
5018
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
47425019 {
4743
- u64 runtime;
47445020 int throttled;
47455021
47465022 /* no need to continue the timer with no bandwidth constraint */
....@@ -4769,24 +5045,15 @@
47695045 cfs_b->nr_throttled += overrun;
47705046
47715047 /*
4772
- * This check is repeated as we are holding onto the new bandwidth while
4773
- * we unthrottle. This can potentially race with an unthrottled group
4774
- * trying to acquire new bandwidth from the global pool. This can result
4775
- * in us over-using our runtime if it is all used during this loop, but
4776
- * only by limited amounts in that extreme case.
5048
+ * This check is repeated as we release cfs_b->lock while we unthrottle.
47775049 */
4778
- while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4779
- runtime = cfs_b->runtime;
4780
- cfs_b->distribute_running = 1;
4781
- raw_spin_unlock(&cfs_b->lock);
5050
+ while (throttled && cfs_b->runtime > 0) {
5051
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
47825052 /* we can't nest cfs_b->lock while distributing bandwidth */
4783
- runtime = distribute_cfs_runtime(cfs_b, runtime);
4784
- raw_spin_lock(&cfs_b->lock);
5053
+ distribute_cfs_runtime(cfs_b);
5054
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
47855055
4786
- cfs_b->distribute_running = 0;
47875056 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4788
-
4789
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
47905057 }
47915058
47925059 /*
....@@ -4842,6 +5109,11 @@
48425109 if (runtime_refresh_within(cfs_b, min_left))
48435110 return;
48445111
5112
+ /* don't push forwards an existing deferred unthrottle */
5113
+ if (cfs_b->slack_started)
5114
+ return;
5115
+ cfs_b->slack_started = true;
5116
+
48455117 hrtimer_start(&cfs_b->slack_timer,
48465118 ns_to_ktime(cfs_bandwidth_slack_period),
48475119 HRTIMER_MODE_REL);
....@@ -4889,42 +5161,35 @@
48895161 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
48905162 {
48915163 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
5164
+ unsigned long flags;
48925165
48935166 /* confirm we're still not at a refresh boundary */
4894
- raw_spin_lock(&cfs_b->lock);
4895
- if (cfs_b->distribute_running) {
4896
- raw_spin_unlock(&cfs_b->lock);
4897
- return;
4898
- }
5167
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
5168
+ cfs_b->slack_started = false;
48995169
49005170 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4901
- raw_spin_unlock(&cfs_b->lock);
5171
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49025172 return;
49035173 }
49045174
49055175 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
49065176 runtime = cfs_b->runtime;
49075177
4908
- if (runtime)
4909
- cfs_b->distribute_running = 1;
4910
-
4911
- raw_spin_unlock(&cfs_b->lock);
5178
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49125179
49135180 if (!runtime)
49145181 return;
49155182
4916
- runtime = distribute_cfs_runtime(cfs_b, runtime);
5183
+ distribute_cfs_runtime(cfs_b);
49175184
4918
- raw_spin_lock(&cfs_b->lock);
4919
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
4920
- cfs_b->distribute_running = 0;
4921
- raw_spin_unlock(&cfs_b->lock);
5185
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
5186
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
49225187 }
49235188
49245189 /*
49255190 * When a group wakes up we want to make sure that its quota is not already
49265191 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4927
- * runtime as update_curr() throttling can not not trigger until it's on-rq.
5192
+ * runtime as update_curr() throttling can not trigger until it's on-rq.
49285193 */
49295194 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
49305195 {
....@@ -4959,7 +5224,7 @@
49595224 pcfs_rq = tg->parent->cfs_rq[cpu];
49605225
49615226 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4962
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
5227
+ cfs_rq->throttled_clock_pelt = rq_clock_task_mult(cpu_rq(cpu));
49635228 }
49645229
49655230 /* conditionally throttle active cfs_rq's from put_prev_entity() */
....@@ -4978,8 +5243,7 @@
49785243 if (cfs_rq_throttled(cfs_rq))
49795244 return true;
49805245
4981
- throttle_cfs_rq(cfs_rq);
4982
- return true;
5246
+ return throttle_cfs_rq(cfs_rq);
49835247 }
49845248
49855249 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
....@@ -4998,15 +5262,18 @@
49985262 {
49995263 struct cfs_bandwidth *cfs_b =
50005264 container_of(timer, struct cfs_bandwidth, period_timer);
5265
+ unsigned long flags;
50015266 int overrun;
50025267 int idle = 0;
50035268 int count = 0;
50045269
5005
- raw_spin_lock(&cfs_b->lock);
5270
+ raw_spin_lock_irqsave(&cfs_b->lock, flags);
50065271 for (;;) {
50075272 overrun = hrtimer_forward_now(timer, cfs_b->period);
50085273 if (!overrun)
50095274 break;
5275
+
5276
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
50105277
50115278 if (++count > 3) {
50125279 u64 new, old = ktime_to_ns(cfs_b->period);
....@@ -5037,12 +5304,10 @@
50375304 /* reset count so we don't come right back in here */
50385305 count = 0;
50395306 }
5040
-
5041
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
50425307 }
50435308 if (idle)
50445309 cfs_b->period_active = 0;
5045
- raw_spin_unlock(&cfs_b->lock);
5310
+ raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
50465311
50475312 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
50485313 }
....@@ -5059,7 +5324,7 @@
50595324 cfs_b->period_timer.function = sched_cfs_period_timer;
50605325 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
50615326 cfs_b->slack_timer.function = sched_cfs_slack_timer;
5062
- cfs_b->distribute_running = 0;
5327
+ cfs_b->slack_started = false;
50635328 }
50645329
50655330 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
....@@ -5154,11 +5419,6 @@
51545419 return false;
51555420 }
51565421
5157
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
5158
-{
5159
- return rq_clock_task(rq_of(cfs_rq));
5160
-}
5161
-
51625422 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
51635423 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
51645424 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
....@@ -5216,7 +5476,7 @@
52165476
52175477 if (delta < 0) {
52185478 if (rq->curr == p)
5219
- resched_curr(rq);
5479
+ resched_curr_lazy(rq);
52205480 return;
52215481 }
52225482 hrtick_start(rq, delta);
....@@ -5251,22 +5511,41 @@
52515511
52525512 #ifdef CONFIG_SMP
52535513 static inline unsigned long cpu_util(int cpu);
5254
-static unsigned long capacity_of(int cpu);
52555514
52565515 static inline bool cpu_overutilized(int cpu)
52575516 {
5258
- return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
5517
+ int overutilized = -1;
5518
+
5519
+ trace_android_rvh_cpu_overutilized(cpu, &overutilized);
5520
+ if (overutilized != -1)
5521
+ return overutilized;
5522
+
5523
+ return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
52595524 }
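Unless the vendor hook overrides the decision, the check above marks a CPU overutilized once its utilization no longer fits its capacity with headroom. A minimal sketch under the assumption that fits_capacity() keeps roughly 20% slack (the 1280/1024 ratio below is taken from the old capacity_margin comparison removed in this same patch and should be treated as an assumption):

#include <stdbool.h>
#include <stdio.h>

/* Assumed ~20% headroom check: util must stay below ~80% of capacity. */
static bool fits_capacity(unsigned long util, unsigned long capacity)
{
	return util * 1280 < capacity * 1024;
}

int main(void)
{
	unsigned long capacity = 1024;	/* a big core at full capacity */

	printf("util=700 overutilized? %d\n", !fits_capacity(700, capacity)); /* 0 */
	printf("util=900 overutilized? %d\n", !fits_capacity(900, capacity)); /* 1 */
	return 0;
}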
52605525
52615526 static inline void update_overutilized_status(struct rq *rq)
52625527 {
52635528 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
52645529 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5265
- trace_sched_overutilized(1);
5530
+ trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
52665531 }
52675532 }
52685533 #else
52695534 static inline void update_overutilized_status(struct rq *rq) { }
5535
+#endif
5536
+
5537
+/* Runqueue only has SCHED_IDLE tasks enqueued */
5538
+static int sched_idle_rq(struct rq *rq)
5539
+{
5540
+ return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5541
+ rq->nr_running);
5542
+}
5543
+
5544
+#ifdef CONFIG_SMP
5545
+static int sched_idle_cpu(int cpu)
5546
+{
5547
+ return sched_idle_rq(cpu_rq(cpu));
5548
+}
52705549 #endif
52715550
52725551 /*
....@@ -5279,12 +5558,9 @@
52795558 {
52805559 struct cfs_rq *cfs_rq;
52815560 struct sched_entity *se = &p->se;
5561
+ int idle_h_nr_running = task_has_idle_policy(p);
52825562 int task_new = !(flags & ENQUEUE_WAKEUP);
5283
-
5284
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
5285
- if (sysctl_sched_performance_bias)
5286
- cpufreq_task_boost(rq->cpu, task_util_est(p));
5287
-#endif
5563
+ int should_iowait_boost;
52885564
52895565 /*
52905566 * The code below (indirectly) updates schedutil which looks at
....@@ -5295,29 +5571,13 @@
52955571 util_est_enqueue(&rq->cfs, p);
52965572
52975573 /*
5298
- * The code below (indirectly) updates schedutil which looks at
5299
- * the cfs_rq utilization to select a frequency.
5300
- * Let's update schedtune here to ensure the boost value of the
5301
- * current task is accounted for in the selection of the OPP.
5302
- *
5303
- * We do it also in the case where we enqueue a throttled task;
5304
- * we could argue that a throttled task should not boost a CPU,
5305
- * however:
5306
- * a) properly implementing CPU boosting considering throttled
5307
- * tasks will increase a lot the complexity of the solution
5308
- * b) it's not easy to quantify the benefits introduced by
5309
- * such a more complex solution.
5310
- * Thus, for the time being we go for the simple solution and boost
5311
- * also for throttled RQs.
5312
- */
5313
- schedtune_enqueue_task(p, cpu_of(rq));
5314
-
5315
- /*
53165574 * If in_iowait is set, the code below may not trigger any cpufreq
53175575 * utilization updates, so do it here explicitly with the IOWAIT flag
53185576 * passed.
53195577 */
5320
- if (p->in_iowait)
5578
+ should_iowait_boost = p->in_iowait;
5579
+ trace_android_rvh_set_iowait(p, &should_iowait_boost);
5580
+ if (should_iowait_boost)
53215581 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
53225582
53235583 for_each_sched_entity(se) {
....@@ -5326,51 +5586,60 @@
53265586 cfs_rq = cfs_rq_of(se);
53275587 enqueue_entity(cfs_rq, se, flags);
53285588
5329
- /*
5330
- * end evaluation on encountering a throttled cfs_rq
5331
- *
5332
- * note: in the case of encountering a throttled cfs_rq we will
5333
- * post the final h_nr_running increment below.
5334
- */
5335
- if (cfs_rq_throttled(cfs_rq))
5336
- break;
53375589 cfs_rq->h_nr_running++;
5590
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5591
+
5592
+ /* end evaluation on encountering a throttled cfs_rq */
5593
+ if (cfs_rq_throttled(cfs_rq))
5594
+ goto enqueue_throttle;
53385595
53395596 flags = ENQUEUE_WAKEUP;
53405597 }
53415598
5599
+ trace_android_rvh_enqueue_task_fair(rq, p, flags);
53425600 for_each_sched_entity(se) {
53435601 cfs_rq = cfs_rq_of(se);
5344
- cfs_rq->h_nr_running++;
5345
-
5346
- if (cfs_rq_throttled(cfs_rq))
5347
- break;
53485602
53495603 update_load_avg(cfs_rq, se, UPDATE_TG);
5604
+ se_update_runnable(se);
53505605 update_cfs_group(se);
5606
+
5607
+ cfs_rq->h_nr_running++;
5608
+ cfs_rq->idle_h_nr_running += idle_h_nr_running;
5609
+
5610
+ /* end evaluation on encountering a throttled cfs_rq */
5611
+ if (cfs_rq_throttled(cfs_rq))
5612
+ goto enqueue_throttle;
5613
+
5614
+ /*
5615
+ * One parent has been throttled and cfs_rq removed from the
5616
+ * list. Add it back to not break the leaf list.
5617
+ */
5618
+ if (throttled_hierarchy(cfs_rq))
5619
+ list_add_leaf_cfs_rq(cfs_rq);
53515620 }
53525621
5353
- if (!se) {
5354
- add_nr_running(rq, 1);
5355
- /*
5356
- * Since new tasks are assigned an initial util_avg equal to
5357
- * half of the spare capacity of their CPU, tiny tasks have the
5358
- * ability to cross the overutilized threshold, which will
5359
- * result in the load balancer ruining all the task placement
5360
- * done by EAS. As a way to mitigate that effect, do not account
5361
- * for the first enqueue operation of new tasks during the
5362
- * overutilized flag detection.
5363
- *
5364
- * A better way of solving this problem would be to wait for
5365
- * the PELT signals of tasks to converge before taking them
5366
- * into account, but that is not straightforward to implement,
5367
- * and the following generally works well enough in practice.
5368
- */
5369
- if (!task_new)
5370
- update_overutilized_status(rq);
5622
+ /* At this point se is NULL and we are at root level */
5623
+ add_nr_running(rq, 1);
53715624
5372
- }
5625
+ /*
5626
+ * Since new tasks are assigned an initial util_avg equal to
5627
+ * half of the spare capacity of their CPU, tiny tasks have the
5628
+ * ability to cross the overutilized threshold, which will
5629
+ * result in the load balancer ruining all the task placement
5630
+ * done by EAS. As a way to mitigate that effect, do not account
5631
+ * for the first enqueue operation of new tasks during the
5632
+ * overutilized flag detection.
5633
+ *
5634
+ * A better way of solving this problem would be to wait for
5635
+ * the PELT signals of tasks to converge before taking them
5636
+ * into account, but that is not straightforward to implement,
5637
+ * and the following generally works well enough in practice.
5638
+ */
5639
+ if (!task_new)
5640
+ update_overutilized_status(rq);
53735641
5642
+enqueue_throttle:
53745643 if (cfs_bandwidth_used()) {
53755644 /*
53765645 * When bandwidth control is enabled; the cfs_rq_throttled()
....@@ -5403,28 +5672,21 @@
54035672 struct cfs_rq *cfs_rq;
54045673 struct sched_entity *se = &p->se;
54055674 int task_sleep = flags & DEQUEUE_SLEEP;
5675
+ int idle_h_nr_running = task_has_idle_policy(p);
5676
+ bool was_sched_idle = sched_idle_rq(rq);
54065677
5407
- /*
5408
- * The code below (indirectly) updates schedutil which looks at
5409
- * the cfs_rq utilization to select a frequency.
5410
- * Let's update schedtune here to ensure the boost value of the
5411
- * current task is not more accounted for in the selection of the OPP.
5412
- */
5413
- schedtune_dequeue_task(p, cpu_of(rq));
5678
+ util_est_dequeue(&rq->cfs, p);
54145679
54155680 for_each_sched_entity(se) {
54165681 cfs_rq = cfs_rq_of(se);
54175682 dequeue_entity(cfs_rq, se, flags);
54185683
5419
- /*
5420
- * end evaluation on encountering a throttled cfs_rq
5421
- *
5422
- * note: in the case of encountering a throttled cfs_rq we will
5423
- * post the final h_nr_running decrement below.
5424
- */
5425
- if (cfs_rq_throttled(cfs_rq))
5426
- break;
54275684 cfs_rq->h_nr_running--;
5685
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5686
+
5687
+ /* end evaluation on encountering a throttled cfs_rq */
5688
+ if (cfs_rq_throttled(cfs_rq))
5689
+ goto dequeue_throttle;
54285690
54295691 /* Don't dequeue parent if it has other entities besides us */
54305692 if (cfs_rq->load.weight) {
....@@ -5441,21 +5703,32 @@
54415703 flags |= DEQUEUE_SLEEP;
54425704 }
54435705
5706
+ trace_android_rvh_dequeue_task_fair(rq, p, flags);
54445707 for_each_sched_entity(se) {
54455708 cfs_rq = cfs_rq_of(se);
5446
- cfs_rq->h_nr_running--;
5447
-
5448
- if (cfs_rq_throttled(cfs_rq))
5449
- break;
54505709
54515710 update_load_avg(cfs_rq, se, UPDATE_TG);
5711
+ se_update_runnable(se);
54525712 update_cfs_group(se);
5713
+
5714
+ cfs_rq->h_nr_running--;
5715
+ cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5716
+
5717
+ /* end evaluation on encountering a throttled cfs_rq */
5718
+ if (cfs_rq_throttled(cfs_rq))
5719
+ goto dequeue_throttle;
5720
+
54535721 }
54545722
5455
- if (!se)
5456
- sub_nr_running(rq, 1);
5723
+ /* At this point se is NULL and we are at root level */
5724
+ sub_nr_running(rq, 1);
54575725
5458
- util_est_dequeue(&rq->cfs, p, task_sleep);
5726
+ /* balance early to pull high priority tasks */
5727
+ if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5728
+ rq->next_balance = jiffies;
5729
+
5730
+dequeue_throttle:
5731
+ util_est_update(&rq->cfs, p, task_sleep);
54595732 hrtick_update(rq);
54605733 }
54615734
....@@ -5466,71 +5739,6 @@
54665739 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
54675740
54685741 #ifdef CONFIG_NO_HZ_COMMON
5469
-/*
5470
- * per rq 'load' arrray crap; XXX kill this.
5471
- */
5472
-
5473
-/*
5474
- * The exact cpuload calculated at every tick would be:
5475
- *
5476
- * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5477
- *
5478
- * If a CPU misses updates for n ticks (as it was idle) and update gets
5479
- * called on the n+1-th tick when CPU may be busy, then we have:
5480
- *
5481
- * load_n = (1 - 1/2^i)^n * load_0
5482
- * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5483
- *
5484
- * decay_load_missed() below does efficient calculation of
5485
- *
5486
- * load' = (1 - 1/2^i)^n * load
5487
- *
5488
- * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5489
- * This allows us to precompute the above in said factors, thereby allowing the
5490
- * reduction of an arbitrary n in O(log_2 n) steps. (See also
5491
- * fixed_power_int())
5492
- *
5493
- * The calculation is approximated on a 128 point scale.
5494
- */
5495
-#define DEGRADE_SHIFT 7
5496
-
5497
-static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5498
-static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5499
- { 0, 0, 0, 0, 0, 0, 0, 0 },
5500
- { 64, 32, 8, 0, 0, 0, 0, 0 },
5501
- { 96, 72, 40, 12, 1, 0, 0, 0 },
5502
- { 112, 98, 75, 43, 15, 1, 0, 0 },
5503
- { 120, 112, 98, 76, 45, 16, 2, 0 }
5504
-};
5505
-
5506
-/*
5507
- * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5508
- * would be when CPU is idle and so we just decay the old load without
5509
- * adding any new load.
5510
- */
5511
-static unsigned long
5512
-decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5513
-{
5514
- int j = 0;
5515
-
5516
- if (!missed_updates)
5517
- return load;
5518
-
5519
- if (missed_updates >= degrade_zero_ticks[idx])
5520
- return 0;
5521
-
5522
- if (idx == 1)
5523
- return load >> missed_updates;
5524
-
5525
- while (missed_updates) {
5526
- if (missed_updates % 2)
5527
- load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5528
-
5529
- missed_updates >>= 1;
5530
- j++;
5531
- }
5532
- return load;
5533
-}
55345742
55355743 static struct {
55365744 cpumask_var_t idle_cpus_mask;
....@@ -5542,249 +5750,68 @@
55425750
55435751 #endif /* CONFIG_NO_HZ_COMMON */
55445752
5545
-/**
5546
- * __cpu_load_update - update the rq->cpu_load[] statistics
5547
- * @this_rq: The rq to update statistics for
5548
- * @this_load: The current load
5549
- * @pending_updates: The number of missed updates
5550
- *
5551
- * Update rq->cpu_load[] statistics. This function is usually called every
5552
- * scheduler tick (TICK_NSEC).
5553
- *
5554
- * This function computes a decaying average:
5555
- *
5556
- * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5557
- *
5558
- * Because of NOHZ it might not get called on every tick which gives need for
5559
- * the @pending_updates argument.
5560
- *
5561
- * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5562
- * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5563
- * = A * (A * load[i]_n-2 + B) + B
5564
- * = A * (A * (A * load[i]_n-3 + B) + B) + B
5565
- * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5566
- * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5567
- * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5568
- * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5569
- *
5570
- * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5571
- * any change in load would have resulted in the tick being turned back on.
5572
- *
5573
- * For regular NOHZ, this reduces to:
5574
- *
5575
- * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5576
- *
5577
- * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5578
- * term.
5579
- */
5580
-static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5581
- unsigned long pending_updates)
5753
+static unsigned long cpu_load(struct rq *rq)
55825754 {
5583
- unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5584
- int i, scale;
5585
-
5586
- this_rq->nr_load_updates++;
5587
-
5588
- /* Update our load: */
5589
- this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5590
- for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5591
- unsigned long old_load, new_load;
5592
-
5593
- /* scale is effectively 1 << i now, and >> i divides by scale */
5594
-
5595
- old_load = this_rq->cpu_load[i];
5596
-#ifdef CONFIG_NO_HZ_COMMON
5597
- old_load = decay_load_missed(old_load, pending_updates - 1, i);
5598
- if (tickless_load) {
5599
- old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5600
- /*
5601
- * old_load can never be a negative value because a
5602
- * decayed tickless_load cannot be greater than the
5603
- * original tickless_load.
5604
- */
5605
- old_load += tickless_load;
5606
- }
5607
-#endif
5608
- new_load = this_load;
5609
- /*
5610
- * Round up the averaging division if load is increasing. This
5611
- * prevents us from getting stuck on 9 if the load is 10, for
5612
- * example.
5613
- */
5614
- if (new_load > old_load)
5615
- new_load += scale - 1;
5616
-
5617
- this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5618
- }
5619
-}
5620
-
5621
-/* Used instead of source_load when we know the type == 0 */
5622
-static unsigned long weighted_cpuload(struct rq *rq)
5623
-{
5624
- return cfs_rq_runnable_load_avg(&rq->cfs);
5625
-}
5626
-
5627
-#ifdef CONFIG_NO_HZ_COMMON
5628
-/*
5629
- * There is no sane way to deal with nohz on smp when using jiffies because the
5630
- * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5631
- * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5632
- *
5633
- * Therefore we need to avoid the delta approach from the regular tick when
5634
- * possible since that would seriously skew the load calculation. This is why we
5635
- * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5636
- * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5637
- * loop exit, nohz_idle_balance, nohz full exit...)
5638
- *
5639
- * This means we might still be one tick off for nohz periods.
5640
- */
5641
-
5642
-static void cpu_load_update_nohz(struct rq *this_rq,
5643
- unsigned long curr_jiffies,
5644
- unsigned long load)
5645
-{
5646
- unsigned long pending_updates;
5647
-
5648
- pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5649
- if (pending_updates) {
5650
- this_rq->last_load_update_tick = curr_jiffies;
5651
- /*
5652
- * In the regular NOHZ case, we were idle, this means load 0.
5653
- * In the NOHZ_FULL case, we were non-idle, we should consider
5654
- * its weighted load.
5655
- */
5656
- cpu_load_update(this_rq, load, pending_updates);
5657
- }
5755
+ return cfs_rq_load_avg(&rq->cfs);
56585756 }
56595757
56605758 /*
5661
- * Called from nohz_idle_balance() to update the load ratings before doing the
5662
- * idle balance.
5663
- */
5664
-static void cpu_load_update_idle(struct rq *this_rq)
5665
-{
5666
- /*
5667
- * bail if there's load or we're actually up-to-date.
5668
- */
5669
- if (weighted_cpuload(this_rq))
5670
- return;
5671
-
5672
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5673
-}
5674
-
5675
-/*
5676
- * Record CPU load on nohz entry so we know the tickless load to account
5677
- * on nohz exit. cpu_load[0] happens then to be updated more frequently
5678
- * than other cpu_load[idx] but it should be fine as cpu_load readers
5679
- * shouldn't rely into synchronized cpu_load[*] updates.
5680
- */
5681
-void cpu_load_update_nohz_start(void)
5682
-{
5683
- struct rq *this_rq = this_rq();
5684
-
5685
- /*
5686
- * This is all lockless but should be fine. If weighted_cpuload changes
5687
- * concurrently we'll exit nohz. And cpu_load write can race with
5688
- * cpu_load_update_idle() but both updater would be writing the same.
5689
- */
5690
- this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5691
-}
5692
-
5693
-/*
5694
- * Account the tickless load in the end of a nohz frame.
5695
- */
5696
-void cpu_load_update_nohz_stop(void)
5697
-{
5698
- unsigned long curr_jiffies = READ_ONCE(jiffies);
5699
- struct rq *this_rq = this_rq();
5700
- unsigned long load;
5701
- struct rq_flags rf;
5702
-
5703
- if (curr_jiffies == this_rq->last_load_update_tick)
5704
- return;
5705
-
5706
- load = weighted_cpuload(this_rq);
5707
- rq_lock(this_rq, &rf);
5708
- update_rq_clock(this_rq);
5709
- cpu_load_update_nohz(this_rq, curr_jiffies, load);
5710
- rq_unlock(this_rq, &rf);
5711
-}
5712
-#else /* !CONFIG_NO_HZ_COMMON */
5713
-static inline void cpu_load_update_nohz(struct rq *this_rq,
5714
- unsigned long curr_jiffies,
5715
- unsigned long load) { }
5716
-#endif /* CONFIG_NO_HZ_COMMON */
5717
-
5718
-static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5719
-{
5720
-#ifdef CONFIG_NO_HZ_COMMON
5721
- /* See the mess around cpu_load_update_nohz(). */
5722
- this_rq->last_load_update_tick = READ_ONCE(jiffies);
5723
-#endif
5724
- cpu_load_update(this_rq, load, 1);
5725
-}
5726
-
5727
-/*
5728
- * Called from scheduler_tick()
5729
- */
5730
-void cpu_load_update_active(struct rq *this_rq)
5731
-{
5732
- unsigned long load = weighted_cpuload(this_rq);
5733
-
5734
- if (tick_nohz_tick_stopped())
5735
- cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5736
- else
5737
- cpu_load_update_periodic(this_rq, load);
5738
-}
5739
-
5740
-/*
5741
- * Return a low guess at the load of a migration-source CPU weighted
5742
- * according to the scheduling class and "nice" value.
5759
+ * cpu_load_without - compute CPU load without any contributions from *p
5760
+ * @cpu: the CPU whose load is requested
5761
+ * @p: the task whose load should be discounted
57435762 *
5744
- * We want to under-estimate the load of migration sources, to
5745
- * balance conservatively.
5763
+ * The load of a CPU is defined by the load of tasks currently enqueued on that
5764
+ * CPU as well as tasks which are currently sleeping after an execution on that
5765
+ * CPU.
5766
+ *
5767
+ * This method returns the load of the specified CPU by discounting the load of
5768
+ * the specified task, whenever the task is currently contributing to the CPU
5769
+ * load.
57465770 */
5747
-static unsigned long source_load(int cpu, int type)
5771
+static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
57485772 {
5749
- struct rq *rq = cpu_rq(cpu);
5750
- unsigned long total = weighted_cpuload(rq);
5773
+ struct cfs_rq *cfs_rq;
5774
+ unsigned int load;
57515775
5752
- if (type == 0 || !sched_feat(LB_BIAS))
5753
- return total;
5776
+ /* Task has no contribution or is new */
5777
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5778
+ return cpu_load(rq);
57545779
5755
- return min(rq->cpu_load[type-1], total);
5780
+ cfs_rq = &rq->cfs;
5781
+ load = READ_ONCE(cfs_rq->avg.load_avg);
5782
+
5783
+ /* Discount task's load from CPU's load */
5784
+ lsub_positive(&load, task_h_load(p));
5785
+
5786
+ return load;
57565787 }
57575788
5758
-/*
5759
- * Return a high guess at the load of a migration-target CPU weighted
5760
- * according to the scheduling class and "nice" value.
5761
- */
5762
-static unsigned long target_load(int cpu, int type)
5789
+static unsigned long cpu_runnable(struct rq *rq)
57635790 {
5764
- struct rq *rq = cpu_rq(cpu);
5765
- unsigned long total = weighted_cpuload(rq);
5791
+ return cfs_rq_runnable_avg(&rq->cfs);
5792
+}
57665793
5767
- if (type == 0 || !sched_feat(LB_BIAS))
5768
- return total;
5794
+static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5795
+{
5796
+ struct cfs_rq *cfs_rq;
5797
+ unsigned int runnable;
57695798
5770
- return max(rq->cpu_load[type-1], total);
5799
+ /* Task has no contribution or is new */
5800
+ if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5801
+ return cpu_runnable(rq);
5802
+
5803
+ cfs_rq = &rq->cfs;
5804
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5805
+
5806
+ /* Discount task's runnable from CPU's runnable */
5807
+ lsub_positive(&runnable, p->se.avg.runnable_avg);
5808
+
5809
+ return runnable;
57715810 }
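Both *_without() helpers above subtract the waking task's own contribution from the runqueue-wide signal, clamping at zero so a stale or larger task signal cannot underflow the unsigned average. A hedged sketch of that pattern; sub_clamped() is only a stand-in for the kernel's lsub_positive(), which operates on a pointer:

#include <stdio.h>

/* Stand-in for the clamping subtraction used above: never go below zero. */
static unsigned int sub_clamped(unsigned int a, unsigned int b)
{
	return a > b ? a - b : 0;
}

int main(void)
{
	unsigned int cpu_load = 300;	/* invented rq-wide load_avg */
	unsigned int task_load = 450;	/* stale task contribution, larger than the rq signal */

	/* Without the clamp this would wrap to a huge unsigned value. */
	printf("load without task = %u\n", sub_clamped(cpu_load, task_load));
	return 0;
}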
57725811
57735812 static unsigned long capacity_of(int cpu)
57745813 {
57755814 return cpu_rq(cpu)->cpu_capacity;
5776
-}
5777
-
5778
-static unsigned long cpu_avg_load_per_task(int cpu)
5779
-{
5780
- struct rq *rq = cpu_rq(cpu);
5781
- unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5782
- unsigned long load_avg = weighted_cpuload(rq);
5783
-
5784
- if (nr_running)
5785
- return load_avg / nr_running;
5786
-
5787
- return 0;
57885815 }
57895816
57905817 static void record_wakee(struct task_struct *p)
....@@ -5821,18 +5848,15 @@
58215848 * whatever is irrelevant, spread criteria is apparent partner count exceeds
58225849 * socket size.
58235850 */
5824
-static int wake_wide(struct task_struct *p, int sibling_count_hint)
5851
+static int wake_wide(struct task_struct *p)
58255852 {
58265853 unsigned int master = current->wakee_flips;
58275854 unsigned int slave = p->wakee_flips;
5828
- int llc_size = this_cpu_read(sd_llc_size);
5829
-
5830
- if (sibling_count_hint >= llc_size)
5831
- return 1;
5855
+ int factor = __this_cpu_read(sd_llc_size);
58325856
58335857 if (master < slave)
58345858 swap(master, slave);
5835
- if (slave < llc_size || master < slave * llc_size)
5859
+ if (slave < factor || master < slave * factor)
58365860 return 0;
58375861 return 1;
58385862 }
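wake_wide() above chooses to spread waker and wakee apart only when the flip counters suggest a 1:N wakeup pattern wider than the LLC: with factor = LLC size, it returns 1 once slave >= factor and master >= slave * factor. A short worked example with invented counters; the factor of 4 is an assumed LLC size, not something read from a topology:

#include <stdio.h>

/* Same decision as wake_wide() above, with the LLC size passed in. */
static int wake_wide(unsigned int master, unsigned int slave, unsigned int factor)
{
	if (master < slave) {
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	if (slave < factor || master < slave * factor)
		return 0;
	return 1;
}

int main(void)
{
	unsigned int factor = 4;	/* assumed LLC size */

	printf("%d\n", wake_wide(40, 5, factor));	/* 1: wide 1:N pattern, spread */
	printf("%d\n", wake_wide(6, 5, factor));	/* 0: keep waker and wakee close */
	return 0;
}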
....@@ -5880,7 +5904,7 @@
58805904 s64 this_eff_load, prev_eff_load;
58815905 unsigned long task_load;
58825906
5883
- this_eff_load = target_load(this_cpu, sd->wake_idx);
5907
+ this_eff_load = cpu_load(cpu_rq(this_cpu));
58845908
58855909 if (sync) {
58865910 unsigned long current_load = task_h_load(current);
....@@ -5898,7 +5922,7 @@
58985922 this_eff_load *= 100;
58995923 this_eff_load *= capacity_of(prev_cpu);
59005924
5901
- prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5925
+ prev_eff_load = cpu_load(cpu_rq(prev_cpu));
59025926 prev_eff_load -= task_load;
59035927 if (sched_feat(WA_BIAS))
59045928 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
....@@ -5936,242 +5960,8 @@
59365960 return target;
59375961 }
59385962
5939
-#ifdef CONFIG_SCHED_TUNE
5940
-struct reciprocal_value schedtune_spc_rdiv;
5941
-
5942
-static long
5943
-schedtune_margin(unsigned long signal, long boost)
5944
-{
5945
- long long margin = 0;
5946
-
5947
- /*
5948
- * Signal proportional compensation (SPC)
5949
- *
5950
- * The Boost (B) value is used to compute a Margin (M) which is
5951
- * proportional to the complement of the original Signal (S):
5952
- * M = B * (SCHED_CAPACITY_SCALE - S)
5953
- * The obtained M could be used by the caller to "boost" S.
5954
- */
5955
- if (boost >= 0) {
5956
- margin = SCHED_CAPACITY_SCALE - signal;
5957
- margin *= boost;
5958
- } else
5959
- margin = -signal * boost;
5960
-
5961
- margin = reciprocal_divide(margin, schedtune_spc_rdiv);
5962
-
5963
- if (boost < 0)
5964
- margin *= -1;
5965
- return margin;
5966
-}
5967
-
5968
-inline long
5969
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
5970
-{
5971
- int boost = schedtune_cpu_boost_with(cpu, p);
5972
- long margin;
5973
-
5974
- if (boost == 0)
5975
- margin = 0;
5976
- else
5977
- margin = schedtune_margin(util, boost);
5978
-
5979
- trace_sched_boost_cpu(cpu, util, margin);
5980
-
5981
- return margin;
5982
-}
5983
-
5984
-long schedtune_task_margin(struct task_struct *task)
5985
-{
5986
- int boost = schedtune_task_boost(task);
5987
- unsigned long util;
5988
- long margin;
5989
-
5990
- if (boost == 0)
5991
- return 0;
5992
-
5993
- util = task_util_est(task);
5994
- margin = schedtune_margin(util, boost);
5995
-
5996
- return margin;
5997
-}
5998
-
5999
-#else /* CONFIG_SCHED_TUNE */
6000
-
6001
-inline long
6002
-schedtune_cpu_margin_with(unsigned long util, int cpu, struct task_struct *p)
6003
-{
6004
- return 0;
6005
-}
6006
-
6007
-#endif /* CONFIG_SCHED_TUNE */
6008
-
6009
-static unsigned long cpu_util_without(int cpu, struct task_struct *p);
6010
-
6011
-static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
6012
-{
6013
- return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
6014
-}
6015
-
6016
-/*
6017
- * find_idlest_group finds and returns the least busy CPU group within the
6018
- * domain.
6019
- *
6020
- * Assumes p is allowed on at least one CPU in sd.
6021
- */
60225963 static struct sched_group *
6023
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6024
- int this_cpu, int sd_flag)
6025
-{
6026
- struct sched_group *idlest = NULL, *group = sd->groups;
6027
- struct sched_group *most_spare_sg = NULL;
6028
- unsigned long min_runnable_load = ULONG_MAX;
6029
- unsigned long this_runnable_load = ULONG_MAX;
6030
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
6031
- unsigned long most_spare = 0, this_spare = 0;
6032
- int load_idx = sd->forkexec_idx;
6033
- int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
6034
- unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
6035
- (sd->imbalance_pct-100) / 100;
6036
-
6037
- if (sd_flag & SD_BALANCE_WAKE)
6038
- load_idx = sd->wake_idx;
6039
-
6040
- do {
6041
- unsigned long load, avg_load, runnable_load;
6042
- unsigned long spare_cap, max_spare_cap;
6043
- int local_group;
6044
- int i;
6045
-
6046
- /* Skip over this group if it has no CPUs allowed */
6047
- if (!cpumask_intersects(sched_group_span(group),
6048
- &p->cpus_allowed))
6049
- continue;
6050
-
6051
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6052
- if (sysctl_sched_performance_bias) {
6053
- if (!task_fits_max(p, group_first_cpu(group)))
6054
- continue;
6055
- }
6056
-#endif
6057
-
6058
- local_group = cpumask_test_cpu(this_cpu,
6059
- sched_group_span(group));
6060
-
6061
- /*
6062
- * Tally up the load of all CPUs in the group and find
6063
- * the group containing the CPU with most spare capacity.
6064
- */
6065
- avg_load = 0;
6066
- runnable_load = 0;
6067
- max_spare_cap = 0;
6068
-
6069
- for_each_cpu(i, sched_group_span(group)) {
6070
- /* Bias balancing toward CPUs of our domain */
6071
- if (local_group)
6072
- load = source_load(i, load_idx);
6073
- else
6074
- load = target_load(i, load_idx);
6075
-
6076
- runnable_load += load;
6077
-
6078
- avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
6079
-
6080
- spare_cap = capacity_spare_without(i, p);
6081
-
6082
- if (spare_cap > max_spare_cap)
6083
- max_spare_cap = spare_cap;
6084
- }
6085
-
6086
- /* Adjust by relative CPU capacity of the group */
6087
- avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
6088
- group->sgc->capacity;
6089
- runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
6090
- group->sgc->capacity;
6091
-
6092
- if (local_group) {
6093
- this_runnable_load = runnable_load;
6094
- this_avg_load = avg_load;
6095
- this_spare = max_spare_cap;
6096
- } else {
6097
- if (min_runnable_load > (runnable_load + imbalance)) {
6098
- /*
6099
- * The runnable load is significantly smaller
6100
- * so we can pick this new CPU:
6101
- */
6102
- min_runnable_load = runnable_load;
6103
- min_avg_load = avg_load;
6104
- idlest = group;
6105
- } else if ((runnable_load < (min_runnable_load + imbalance)) &&
6106
- (100*min_avg_load > imbalance_scale*avg_load)) {
6107
- /*
6108
- * The runnable loads are close so take the
6109
- * blocked load into account through avg_load:
6110
- */
6111
- min_avg_load = avg_load;
6112
- idlest = group;
6113
- }
6114
-
6115
- if (most_spare < max_spare_cap) {
6116
- most_spare = max_spare_cap;
6117
- most_spare_sg = group;
6118
- }
6119
- }
6120
- } while (group = group->next, group != sd->groups);
6121
-
6122
- /*
6123
- * The cross-over point between using spare capacity or least load
6124
- * is too conservative for high utilization tasks on partially
6125
- * utilized systems if we require spare_capacity > task_util(p),
6126
- * so we allow for some task stuffing by using
6127
- * spare_capacity > task_util(p)/2.
6128
- *
6129
- * Spare capacity can't be used for fork because the utilization has
6130
- * not been set yet, we must first select a rq to compute the initial
6131
- * utilization.
6132
- */
6133
- if (sd_flag & SD_BALANCE_FORK)
6134
- goto skip_spare;
6135
-
6136
- if (this_spare > task_util(p) / 2 &&
6137
- imbalance_scale*this_spare > 100*most_spare)
6138
- return NULL;
6139
-
6140
- if (most_spare > task_util(p) / 2)
6141
- return most_spare_sg;
6142
-
6143
-skip_spare:
6144
- if (!idlest)
6145
- return NULL;
6146
-
6147
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
6148
- if (sysctl_sched_performance_bias) {
6149
- if ((this_runnable_load == ULONG_MAX) || (this_avg_load == ULONG_MAX))
6150
- return idlest;
6151
- }
6152
-#endif
6153
-
6154
- /*
6155
- * When comparing groups across NUMA domains, it's possible for the
6156
- * local domain to be very lightly loaded relative to the remote
6157
- * domains but "imbalance" skews the comparison making remote CPUs
6158
- * look much more favourable. When considering cross-domain, add
6159
- * imbalance to the runnable load on the remote node and consider
6160
- * staying local.
6161
- */
6162
- if ((sd->flags & SD_NUMA) &&
6163
- min_runnable_load + imbalance >= this_runnable_load)
6164
- return NULL;
6165
-
6166
- if (min_runnable_load > (this_runnable_load + imbalance))
6167
- return NULL;
6168
-
6169
- if ((this_runnable_load < (min_runnable_load + imbalance)) &&
6170
- (100*this_avg_load < imbalance_scale*min_avg_load))
6171
- return NULL;
6172
-
6173
- return idlest;
6174
-}
5964
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
61755965
61765966 /*
61775967 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
....@@ -6191,7 +5981,10 @@
61915981 return cpumask_first(sched_group_span(group));
61925982
61935983 /* Traverse only the allowed CPUs */
6194
- for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
5984
+ for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5985
+ if (sched_idle_cpu(i))
5986
+ return i;
5987
+
61955988 if (available_idle_cpu(i)) {
61965989 struct rq *rq = cpu_rq(i);
61975990 struct cpuidle_state *idle = idle_get_state(rq);
....@@ -6215,7 +6008,7 @@
62156008 shallowest_idle_cpu = i;
62166009 }
62176010 } else if (shallowest_idle_cpu == -1) {
6218
- load = weighted_cpuload(cpu_rq(i));
6011
+ load = cpu_load(cpu_rq(i));
62196012 if (load < min_load) {
62206013 min_load = load;
62216014 least_loaded_cpu = i;
....@@ -6231,11 +6024,11 @@
62316024 {
62326025 int new_cpu = cpu;
62336026
6234
- if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6027
+ if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
62356028 return prev_cpu;
62366029
62376030 /*
6238
- * We need task's util for capacity_spare_without, sync it up to
6031
+ * We need task's util for cpu_util_without, sync it up to
62396032 * prev_cpu's last_update_time.
62406033 */
62416034 if (!(sd_flag & SD_BALANCE_FORK))
....@@ -6251,7 +6044,7 @@
62516044 continue;
62526045 }
62536046
6254
- group = find_idlest_group(sd, p, cpu, sd_flag);
6047
+ group = find_idlest_group(sd, p, cpu);
62556048 if (!group) {
62566049 sd = sd->child;
62576050 continue;
....@@ -6348,16 +6141,18 @@
63486141 if (!test_idle_cores(target, false))
63496142 return -1;
63506143
6351
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6144
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
63526145
63536146 for_each_cpu_wrap(core, cpus, target) {
63546147 bool idle = true;
63556148
63566149 for_each_cpu(cpu, cpu_smt_mask(core)) {
6357
- cpumask_clear_cpu(cpu, cpus);
6358
- if (!available_idle_cpu(cpu))
6150
+ if (!available_idle_cpu(cpu)) {
63596151 idle = false;
6152
+ break;
6153
+ }
63606154 }
6155
+ cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
63616156
63626157 if (idle)
63636158 return core;
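
The rewritten inner loop in select_idle_core() stops at the first busy sibling and then removes the whole SMT core from the candidate set with a single cpumask_andnot(), instead of clearing siblings one at a time. A bitmask sketch of that "drop the whole core at once" step, using a plain unsigned long in place of struct cpumask:

#include <stdio.h>

/* Remove an entire SMT sibling group from the candidate set in one step. */
static unsigned long drop_core(unsigned long candidates, unsigned long smt_mask)
{
	return candidates & ~smt_mask;		/* cpumask_andnot() analogue */
}

int main(void)
{
	unsigned long candidates = 0xffUL;	/* CPUs 0-7 still to scan */
	unsigned long core0_smt  = 0x03UL;	/* CPUs 0 and 1 share a core */

	/* One busy sibling disqualifies the core: both CPUs drop out. */
	candidates = drop_core(candidates, core0_smt);
	printf("remaining candidates: 0x%lx\n", candidates);	/* 0xfc */
	return 0;
}
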
....@@ -6382,9 +6177,10 @@
63826177 return -1;
63836178
63846179 for_each_cpu(cpu, cpu_smt_mask(target)) {
6385
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
6180
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6181
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
63866182 continue;
6387
- if (available_idle_cpu(cpu))
6183
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
63886184 return cpu;
63896185 }
63906186
....@@ -6415,8 +6211,8 @@
64156211 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
64166212 struct sched_domain *this_sd;
64176213 u64 avg_cost, avg_idle;
6418
- u64 time, cost;
6419
- s64 delta;
6214
+ u64 time;
6215
+ int this = smp_processor_id();
64206216 int cpu, nr = INT_MAX;
64216217
64226218 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
....@@ -6441,23 +6237,63 @@
64416237 nr = 4;
64426238 }
64436239
6444
- time = local_clock();
6240
+ time = cpu_clock(this);
64456241
6446
- cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
6242
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
64476243
64486244 for_each_cpu_wrap(cpu, cpus, target) {
64496245 if (!--nr)
64506246 return -1;
6451
- if (available_idle_cpu(cpu))
6247
+ if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
64526248 break;
64536249 }
64546250
6455
- time = local_clock() - time;
6456
- cost = this_sd->avg_scan_cost;
6457
- delta = (s64)(time - cost) / 8;
6458
- this_sd->avg_scan_cost += delta;
6251
+ time = cpu_clock(this) - time;
6252
+ update_avg(&this_sd->avg_scan_cost, time);
64596253
64606254 return cpu;
6255
+}
6256
+
6257
+/*
6258
+ * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6259
+ * the task fits. If no CPU is big enough, but there are idle ones, try to
6260
+ * maximize capacity.
6261
+ */
6262
+static int
6263
+select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6264
+{
6265
+ unsigned long task_util, best_cap = 0;
6266
+ int cpu, best_cpu = -1;
6267
+ struct cpumask *cpus;
6268
+
6269
+ cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6270
+ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6271
+
6272
+ task_util = uclamp_task_util(p);
6273
+
6274
+ for_each_cpu_wrap(cpu, cpus, target) {
6275
+ unsigned long cpu_cap = capacity_of(cpu);
6276
+
6277
+ if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6278
+ continue;
6279
+ if (fits_capacity(task_util, cpu_cap))
6280
+ return cpu;
6281
+
6282
+ if (cpu_cap > best_cap) {
6283
+ best_cap = cpu_cap;
6284
+ best_cpu = cpu;
6285
+ }
6286
+ }
6287
+
6288
+ return best_cpu;
6289
+}
6290
+
6291
+static inline bool asym_fits_capacity(int task_util, int cpu)
6292
+{
6293
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
6294
+ return fits_capacity(task_util, capacity_of(cpu));
6295
+
6296
+ return true;
64616297 }
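
asym_fits_capacity() only filters candidates when the static key for asymmetric CPU capacities is enabled; otherwise it returns true and the idle checks alone decide. The underlying fit test compares the task's clamped utilization against the CPU's capacity with roughly 20% headroom. A hedged sketch of that margin check, assuming the conventional 1280/1024 ratio rather than quoting the kernel macro:

#include <stdio.h>
#include <stdbool.h>

/*
 * "util fits cap" with ~20% headroom: util must stay below ~80% of the
 * CPU's capacity (assumed ratio: util * 1280 < cap * 1024).
 */
static bool fits_capacity_sketch(unsigned long util, unsigned long cap)
{
	return util * 1280 < cap * 1024;
}

static bool asym_fits_capacity_sketch(unsigned long task_util,
				      unsigned long cpu_cap, bool asym)
{
	if (asym)
		return fits_capacity_sketch(task_util, cpu_cap);
	/* Symmetric system: every CPU has the same capacity, no filter. */
	return true;
}

int main(void)
{
	/* util 300 on a little CPU of capacity 400 still leaves headroom. */
	printf("%d\n", asym_fits_capacity_sketch(300, 400, true));	/* 1 */
	/* util 350 no longer leaves ~20% headroom on that CPU. */
	printf("%d\n", asym_fits_capacity_sketch(350, 400, true));	/* 0 */
	printf("%d\n", asym_fits_capacity_sketch(350, 400, false));	/* 1 */
	return 0;
}
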
64626298
64636299 /*
....@@ -6466,24 +6302,54 @@
64666302 static int select_idle_sibling(struct task_struct *p, int prev, int target)
64676303 {
64686304 struct sched_domain *sd;
6305
+ unsigned long task_util;
64696306 int i, recent_used_cpu;
64706307
6471
- if (available_idle_cpu(target))
6308
+ /*
6309
+ * On asymmetric system, update task utilization because we will check
6310
+ * that the task fits with cpu's capacity.
6311
+ */
6312
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6313
+ sync_entity_load_avg(&p->se);
6314
+ task_util = uclamp_task_util(p);
6315
+ }
6316
+
6317
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6318
+ asym_fits_capacity(task_util, target))
64726319 return target;
64736320
64746321 /*
64756322 * If the previous CPU is cache affine and idle, don't be stupid:
64766323 */
6477
- if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
6324
+ if (prev != target && cpus_share_cache(prev, target) &&
6325
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6326
+ asym_fits_capacity(task_util, prev))
64786327 return prev;
6328
+
6329
+ /*
6330
+ * Allow a per-cpu kthread to stack with the wakee if the
6331
+ * kworker thread and the task's previous CPU are the same.
6332
+ * The assumption is that the wakee queued work for the
6333
+ * per-cpu kthread that is now complete and the wakeup is
6334
+ * essentially a sync wakeup. An obvious example of this
6335
+ * pattern is IO completions.
6336
+ */
6337
+ if (is_per_cpu_kthread(current) &&
6338
+ in_task() &&
6339
+ prev == smp_processor_id() &&
6340
+ this_rq()->nr_running <= 1 &&
6341
+ asym_fits_capacity(task_util, prev)) {
6342
+ return prev;
6343
+ }
64796344
64806345 /* Check a recently used CPU as a potential idle candidate: */
64816346 recent_used_cpu = p->recent_used_cpu;
64826347 if (recent_used_cpu != prev &&
64836348 recent_used_cpu != target &&
64846349 cpus_share_cache(recent_used_cpu, target) &&
6485
- available_idle_cpu(recent_used_cpu) &&
6486
- cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6350
+ (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6351
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6352
+ asym_fits_capacity(task_util, recent_used_cpu)) {
64876353 /*
64886354 * Replace recent_used_cpu with prev as it is a potential
64896355 * candidate for the next wake:
....@@ -6492,6 +6358,32 @@
64926358 return recent_used_cpu;
64936359 }
64946360
6361
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6362
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6363
+ goto sd_llc;
6364
+ }
6365
+
6366
+ /*
6367
+ * For asymmetric CPU capacity systems, our domain of interest is
6368
+ * sd_asym_cpucapacity rather than sd_llc.
6369
+ */
6370
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6371
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6372
+ /*
6373
+ * On an asymmetric CPU capacity system where an exclusive
6374
+ * cpuset defines a symmetric island (i.e. one unique
6375
+ * capacity_orig value through the cpuset), the key will be set
6376
+ * but the CPUs within that cpuset will not have a domain with
6377
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6378
+ * capacity path.
6379
+ */
6380
+ if (sd) {
6381
+ i = select_idle_capacity(p, sd, target);
6382
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
6383
+ }
6384
+ }
6385
+
6386
+sd_llc:
64956387 sd = rcu_dereference(per_cpu(sd_llc, target));
64966388 if (!sd)
64976389 return target;
....@@ -6589,7 +6481,7 @@
65896481 util = READ_ONCE(cfs_rq->avg.util_avg);
65906482
65916483 /* Discount task's util from CPU's util */
6592
- util -= min_t(unsigned int, util, task_util(p));
6484
+ lsub_positive(&util, task_util(p));
65936485
65946486 /*
65956487 * Covered cases:
....@@ -6638,10 +6530,9 @@
66386530 * properly fix the execl regression and it helps in further
66396531 * reducing the chances for the above race.
66406532 */
6641
- if (unlikely(task_on_rq_queued(p) || current == p)) {
6642
- estimated -= min_t(unsigned int, estimated,
6643
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
6644
- }
6533
+ if (unlikely(task_on_rq_queued(p) || current == p))
6534
+ lsub_positive(&estimated, _task_util_est(p));
6535
+
66456536 util = max(util, estimated);
66466537 }
66476538
....@@ -6651,350 +6542,6 @@
66516542 * the cpu_util call.
66526543 */
66536544 return min_t(unsigned long, util, capacity_orig_of(cpu));
6654
-}
6655
-
6656
-/*
6657
- * Returns the current capacity of cpu after applying both
6658
- * cpu and freq scaling.
6659
- */
6660
-unsigned long capacity_curr_of(int cpu)
6661
-{
6662
- unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
6663
- unsigned long scale_freq = arch_scale_freq_capacity(cpu);
6664
-
6665
- return cap_scale(max_cap, scale_freq);
6666
-}
6667
-
6668
-static void find_best_target(struct sched_domain *sd, cpumask_t *cpus,
6669
- struct task_struct *p)
6670
-{
6671
- unsigned long min_util = uclamp_task(p);
6672
- unsigned long target_capacity = ULONG_MAX;
6673
- unsigned long min_wake_util = ULONG_MAX;
6674
- unsigned long target_max_spare_cap = 0;
6675
- unsigned long target_util = ULONG_MAX;
6676
- /* Initialise with deepest possible cstate (INT_MAX) */
6677
- int shallowest_idle_cstate = INT_MAX;
6678
- struct sched_group *sg;
6679
- int best_active_cpu = -1;
6680
- int best_idle_cpu = -1;
6681
- int target_cpu = -1;
6682
- int backup_cpu = -1;
6683
- bool prefer_idle;
6684
- bool boosted;
6685
- int i;
6686
-
6687
- /*
6688
- * In most cases, target_capacity tracks capacity_orig of the most
6689
- * energy efficient CPU candidate, thus requiring to minimise
6690
- * target_capacity. For these cases target_capacity is already
6691
- * initialized to ULONG_MAX.
6692
- * However, for prefer_idle and boosted tasks we look for a high
6693
- * performance CPU, thus requiring to maximise target_capacity. In this
6694
- * case we initialise target_capacity to 0.
6695
- */
6696
- prefer_idle = uclamp_latency_sensitive(p);
6697
- boosted = uclamp_boosted(p);
6698
- if (prefer_idle && boosted)
6699
- target_capacity = 0;
6700
-
6701
- /* Scan CPUs in all SDs */
6702
- sg = sd->groups;
6703
- do {
6704
- for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) {
6705
- unsigned long capacity_curr = capacity_curr_of(i);
6706
- unsigned long capacity_orig = capacity_orig_of(i);
6707
- unsigned long wake_util, new_util;
6708
- long spare_cap;
6709
- int idle_idx = INT_MAX;
6710
-
6711
- if (!cpu_online(i))
6712
- continue;
6713
-
6714
- /*
6715
- * p's blocked utilization is still accounted for on prev_cpu
6716
- * so prev_cpu will receive a negative bias due to the double
6717
- * accounting. However, the blocked utilization may be zero.
6718
- */
6719
- wake_util = cpu_util_without(i, p);
6720
- new_util = wake_util + task_util_est(p);
6721
-
6722
- /*
6723
- * Ensure minimum capacity to grant the required boost.
6724
- * The target CPU can be already at a capacity level higher
6725
- * than the one required to boost the task.
6726
- */
6727
- new_util = max(min_util, new_util);
6728
- if (new_util > capacity_orig)
6729
- continue;
6730
-
6731
- /*
6732
- * Pre-compute the maximum possible capacity we expect
6733
- * to have available on this CPU once the task is
6734
- * enqueued here.
6735
- */
6736
- spare_cap = capacity_orig - new_util;
6737
-
6738
- if (idle_cpu(i))
6739
- idle_idx = idle_get_state_idx(cpu_rq(i));
6740
-
6741
-
6742
- /*
6743
- * Case A) Latency sensitive tasks
6744
- *
6745
- * Unconditionally favoring tasks that prefer idle CPU to
6746
- * improve latency.
6747
- *
6748
- * Looking for:
6749
- * - an idle CPU, whatever its idle_state is, since
6750
- * the first CPUs we explore are more likely to be
6751
- * reserved for latency sensitive tasks.
6752
- * - a non idle CPU where the task fits in its current
6753
- * capacity and has the maximum spare capacity.
6754
- * - a non idle CPU with lower contention from other
6755
- * tasks and running at the lowest possible OPP.
6756
- *
6757
- * The last two goals tries to favor a non idle CPU
6758
- * where the task can run as if it is "almost alone".
6759
- * A maximum spare capacity CPU is favoured since
6760
- * the task already fits into that CPU's capacity
6761
- * without waiting for an OPP chance.
6762
- *
6763
- * The following code path is the only one in the CPUs
6764
- * exploration loop which is always used by
6765
- * prefer_idle tasks. It exits the loop with wither a
6766
- * best_active_cpu or a target_cpu which should
6767
- * represent an optimal choice for latency sensitive
6768
- * tasks.
6769
- */
6770
- if (prefer_idle) {
6771
-
6772
- /*
6773
- * Case A.1: IDLE CPU
6774
- * Return the best IDLE CPU we find:
6775
- * - for boosted tasks: the CPU with the highest
6776
- * performance (i.e. biggest capacity_orig)
6777
- * - for !boosted tasks: the most energy
6778
- * efficient CPU (i.e. smallest capacity_orig)
6779
- */
6780
- if (idle_cpu(i)) {
6781
- if (boosted &&
6782
- capacity_orig < target_capacity)
6783
- continue;
6784
- if (!boosted &&
6785
- capacity_orig > target_capacity)
6786
- continue;
6787
- /*
6788
- * Minimise value of idle state: skip
6789
- * deeper idle states and pick the
6790
- * shallowest.
6791
- */
6792
- if (capacity_orig == target_capacity &&
6793
- sysctl_sched_cstate_aware &&
6794
- idle_idx >= shallowest_idle_cstate)
6795
- continue;
6796
-
6797
- target_capacity = capacity_orig;
6798
- shallowest_idle_cstate = idle_idx;
6799
- best_idle_cpu = i;
6800
- continue;
6801
- }
6802
- if (best_idle_cpu != -1)
6803
- continue;
6804
-
6805
- /*
6806
- * Case A.2: Target ACTIVE CPU
6807
- * Favor CPUs with max spare capacity.
6808
- */
6809
- if (capacity_curr > new_util &&
6810
- spare_cap > target_max_spare_cap) {
6811
- target_max_spare_cap = spare_cap;
6812
- target_cpu = i;
6813
- continue;
6814
- }
6815
- if (target_cpu != -1)
6816
- continue;
6817
-
6818
-
6819
- /*
6820
- * Case A.3: Backup ACTIVE CPU
6821
- * Favor CPUs with:
6822
- * - lower utilization due to other tasks
6823
- * - lower utilization with the task in
6824
- */
6825
- if (wake_util > min_wake_util)
6826
- continue;
6827
- min_wake_util = wake_util;
6828
- best_active_cpu = i;
6829
- continue;
6830
- }
6831
-
6832
- /*
6833
- * Enforce EAS mode
6834
- *
6835
- * For non latency sensitive tasks, skip CPUs that
6836
- * will be overutilized by moving the task there.
6837
- *
6838
- * The goal here is to remain in EAS mode as long as
6839
- * possible at least for !prefer_idle tasks.
6840
- */
6841
- if ((new_util * capacity_margin) >
6842
- (capacity_orig * SCHED_CAPACITY_SCALE))
6843
- continue;
6844
-
6845
- /*
6846
- * Favor CPUs with smaller capacity for non latency
6847
- * sensitive tasks.
6848
- */
6849
- if (capacity_orig > target_capacity)
6850
- continue;
6851
-
6852
- /*
6853
- * Case B) Non latency sensitive tasks on IDLE CPUs.
6854
- *
6855
- * Find an optimal backup IDLE CPU for non latency
6856
- * sensitive tasks.
6857
- *
6858
- * Looking for:
6859
- * - minimizing the capacity_orig,
6860
- * i.e. preferring LITTLE CPUs
6861
- * - favoring shallowest idle states
6862
- * i.e. avoid to wakeup deep-idle CPUs
6863
- *
6864
- * The following code path is used by non latency
6865
- * sensitive tasks if IDLE CPUs are available. If at
6866
- * least one of such CPUs are available it sets the
6867
- * best_idle_cpu to the most suitable idle CPU to be
6868
- * selected.
6869
- *
6870
- * If idle CPUs are available, favour these CPUs to
6871
- * improve performances by spreading tasks.
6872
- * Indeed, the energy_diff() computed by the caller
6873
- * will take care to ensure the minimization of energy
6874
- * consumptions without affecting performance.
6875
- */
6876
- if (idle_cpu(i)) {
6877
- /*
6878
- * Skip CPUs in deeper idle state, but only
6879
- * if they are also less energy efficient.
6880
- * IOW, prefer a deep IDLE LITTLE CPU vs a
6881
- * shallow idle big CPU.
6882
- */
6883
- if (capacity_orig == target_capacity &&
6884
- sysctl_sched_cstate_aware &&
6885
- idle_idx >= shallowest_idle_cstate)
6886
- continue;
6887
-
6888
- target_capacity = capacity_orig;
6889
- shallowest_idle_cstate = idle_idx;
6890
- best_idle_cpu = i;
6891
- continue;
6892
- }
6893
-
6894
- /*
6895
- * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6896
- *
6897
- * Pack tasks in the most energy efficient capacities.
6898
- *
6899
- * This task packing strategy prefers more energy
6900
- * efficient CPUs (i.e. pack on smaller maximum
6901
- * capacity CPUs) while also trying to spread tasks to
6902
- * run them all at the lower OPP.
6903
- *
6904
- * This assumes for example that it's more energy
6905
- * efficient to run two tasks on two CPUs at a lower
6906
- * OPP than packing both on a single CPU but running
6907
- * that CPU at an higher OPP.
6908
- *
6909
- * Thus, this case keep track of the CPU with the
6910
- * smallest maximum capacity and highest spare maximum
6911
- * capacity.
6912
- */
6913
-
6914
- /* Favor CPUs with maximum spare capacity */
6915
- if (capacity_orig == target_capacity &&
6916
- spare_cap < target_max_spare_cap)
6917
- continue;
6918
-
6919
- target_max_spare_cap = spare_cap;
6920
- target_capacity = capacity_orig;
6921
- target_util = new_util;
6922
- target_cpu = i;
6923
- }
6924
-
6925
- } while (sg = sg->next, sg != sd->groups);
6926
-
6927
- /*
6928
- * For non latency sensitive tasks, cases B and C in the previous loop,
6929
- * we pick the best IDLE CPU only if we was not able to find a target
6930
- * ACTIVE CPU.
6931
- *
6932
- * Policies priorities:
6933
- *
6934
- * - prefer_idle tasks:
6935
- *
6936
- * a) IDLE CPU available: best_idle_cpu
6937
- * b) ACTIVE CPU where task fits and has the bigger maximum spare
6938
- * capacity (i.e. target_cpu)
6939
- * c) ACTIVE CPU with less contention due to other tasks
6940
- * (i.e. best_active_cpu)
6941
- *
6942
- * - NON prefer_idle tasks:
6943
- *
6944
- * a) ACTIVE CPU: target_cpu
6945
- * b) IDLE CPU: best_idle_cpu
6946
- */
6947
-
6948
- if (prefer_idle && (best_idle_cpu != -1)) {
6949
- target_cpu = best_idle_cpu;
6950
- goto target;
6951
- }
6952
-
6953
- if (target_cpu == -1)
6954
- target_cpu = prefer_idle
6955
- ? best_active_cpu
6956
- : best_idle_cpu;
6957
- else
6958
- backup_cpu = prefer_idle
6959
- ? best_active_cpu
6960
- : best_idle_cpu;
6961
-
6962
- if (backup_cpu >= 0)
6963
- cpumask_set_cpu(backup_cpu, cpus);
6964
- if (target_cpu >= 0) {
6965
-target:
6966
- cpumask_set_cpu(target_cpu, cpus);
6967
- }
6968
-
6969
- trace_sched_find_best_target(p, prefer_idle, min_util, best_idle_cpu,
6970
- best_active_cpu, target_cpu, backup_cpu);
6971
-}
6972
-
6973
-/*
6974
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6975
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6976
- *
6977
- * In that case WAKE_AFFINE doesn't make sense and we'll let
6978
- * BALANCE_WAKE sort things out.
6979
- */
6980
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6981
-{
6982
- long min_cap, max_cap;
6983
-
6984
- if (!static_branch_unlikely(&sched_asym_cpucapacity))
6985
- return 0;
6986
-
6987
- min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6988
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6989
-
6990
- /* Minimum capacity is close to max, no need to abort wake_affine */
6991
- if (max_cap - min_cap < max_cap >> 3)
6992
- return 0;
6993
-
6994
- /* Bring task utilization in sync with prev_cpu */
6995
- sync_entity_load_avg(&p->se);
6996
-
6997
- return !task_fits_capacity(p, min_cap);
69986545 }
69996546
70006547 /*
....@@ -7036,154 +6583,61 @@
70366583 }
70376584
70386585 /*
7039
- * compute_energy(): Estimates the energy that would be consumed if @p was
6586
+ * compute_energy(): Estimates the energy that @pd would consume if @p was
70406587 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
7041
- * landscape of the * CPUs after the task migration, and uses the Energy Model
6588
+ * landscape of @pd's CPUs after the task migration, and uses the Energy Model
70426589 * to compute what would be the energy if we decided to actually migrate that
70436590 * task.
70446591 */
70456592 static long
70466593 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
70476594 {
7048
- unsigned int max_util, util_cfs, cpu_util, cpu_cap;
7049
- unsigned long sum_util, energy = 0;
7050
- struct task_struct *tsk;
6595
+ struct cpumask *pd_mask = perf_domain_span(pd);
6596
+ unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6597
+ unsigned long max_util = 0, sum_util = 0;
6598
+ unsigned long energy = 0;
70516599 int cpu;
70526600
7053
- for (; pd; pd = pd->next) {
7054
- struct cpumask *pd_mask = perf_domain_span(pd);
6601
+ /*
6602
+ * The capacity state of CPUs of the current rd can be driven by CPUs
6603
+ * of another rd if they belong to the same pd. So, account for the
6604
+ * utilization of these CPUs too by masking pd with cpu_online_mask
6605
+ * instead of the rd span.
6606
+ *
6607
+ * If an entire pd is outside of the current rd, it will not appear in
6608
+ * its pd list and will not be accounted by compute_energy().
6609
+ */
6610
+ for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6611
+ unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6612
+ struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
70556613
70566614 /*
7057
- * The energy model mandates all the CPUs of a performance
7058
- * domain have the same capacity.
6615
+ * Busy time computation: utilization clamping is not
6616
+ * required since the ratio (sum_util / cpu_capacity)
6617
+ * is already enough to scale the EM reported power
6618
+ * consumption at the (eventually clamped) cpu_capacity.
70596619 */
7060
- cpu_cap = arch_scale_cpu_capacity(NULL, cpumask_first(pd_mask));
7061
- max_util = sum_util = 0;
6620
+ sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6621
+ ENERGY_UTIL, NULL);
70626622
70636623 /*
7064
- * The capacity state of CPUs of the current rd can be driven by
7065
- * CPUs of another rd if they belong to the same performance
7066
- * domain. So, account for the utilization of these CPUs too
7067
- * by masking pd with cpu_online_mask instead of the rd span.
7068
- *
7069
- * If an entire performance domain is outside of the current rd,
7070
- * it will not appear in its pd list and will not be accounted
7071
- * by compute_energy().
6624
+ * Performance domain frequency: utilization clamping
6625
+ * must be considered since it affects the selection
6626
+ * of the performance domain frequency.
6627
+ * NOTE: in case RT tasks are running, by default the
6628
+ * FREQUENCY_UTIL's utilization can be max OPP.
70726629 */
7073
- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
7074
- util_cfs = cpu_util_next(cpu, p, dst_cpu);
7075
-
7076
- /*
7077
- * Busy time computation: utilization clamping is not
7078
- * required since the ratio (sum_util / cpu_capacity)
7079
- * is already enough to scale the EM reported power
7080
- * consumption at the (eventually clamped) cpu_capacity.
7081
- */
7082
- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7083
- ENERGY_UTIL, NULL);
7084
-
7085
- /*
7086
- * Performance domain frequency: utilization clamping
7087
- * must be considered since it affects the selection
7088
- * of the performance domain frequency.
7089
- * NOTE: in case RT tasks are running, by default the
7090
- * FREQUENCY_UTIL's utilization can be max OPP.
7091
- */
7092
- tsk = cpu == dst_cpu ? p : NULL;
7093
- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
7094
- FREQUENCY_UTIL, tsk);
7095
- max_util = max(max_util, cpu_util);
7096
- }
7097
-
7098
- energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6630
+ cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6631
+ FREQUENCY_UTIL, tsk);
6632
+ max_util = max(max_util, cpu_util);
70996633 }
6634
+
6635
+ trace_android_vh_em_cpu_energy(pd->em_pd, max_util, sum_util, &energy);
6636
+ if (!energy)
6637
+ energy = em_cpu_energy(pd->em_pd, max_util, sum_util);
71006638
71016639 return energy;
71026640 }
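
compute_energy() now works on a single performance domain: for every online CPU in the pd it accumulates the post-migration busy time into sum_util and tracks the highest clamp-aware utilization in max_util, which determines the frequency the whole domain must run at; both are then handed to em_cpu_energy() (or the vendor hook). A toy illustration of that two-signal aggregation, using a made-up cost table in place of the real Energy Model:

#include <stdio.h>

#define NR_PD_CPUS	4
#define CPU_CAP		1024UL

/* Made-up per-OPP cost table standing in for the Energy Model. */
static unsigned long toy_cost_for(unsigned long max_util)
{
	if (max_util <= 256)
		return 100;	/* low OPP */
	if (max_util <= 512)
		return 250;	/* mid OPP */
	return 600;		/* high OPP */
}

/*
 * Toy pd energy: the highest utilization picks the shared OPP (and its
 * cost), the summed utilization scales how long the domain stays busy.
 */
static unsigned long toy_pd_energy(const unsigned long util[], int n)
{
	unsigned long max_util = 0, sum_util = 0;

	for (int i = 0; i < n; i++) {
		sum_util += util[i];
		if (util[i] > max_util)
			max_util = util[i];
	}
	return toy_cost_for(max_util) * sum_util / CPU_CAP;
}

int main(void)
{
	unsigned long without[NR_PD_CPUS] = { 100, 150, 200, 250 };
	unsigned long with[NR_PD_CPUS]    = { 100, 150, 200, 550 };	/* task on CPU 3 */

	printf("pd energy without task: %lu\n", toy_pd_energy(without, NR_PD_CPUS));
	printf("pd energy with task:    %lu\n", toy_pd_energy(with, NR_PD_CPUS));
	return 0;
}
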
7103
-
7104
-static void select_cpu_candidates(struct sched_domain *sd, cpumask_t *cpus,
7105
- struct perf_domain *pd, struct task_struct *p, int prev_cpu)
7106
-{
7107
- int highest_spare_cap_cpu = prev_cpu, best_idle_cpu = -1;
7108
- unsigned long spare_cap, max_spare_cap, util, cpu_cap;
7109
- bool prefer_idle = uclamp_latency_sensitive(p);
7110
- bool boosted = uclamp_boosted(p);
7111
- unsigned long target_cap = boosted ? 0 : ULONG_MAX;
7112
- unsigned long highest_spare_cap = 0;
7113
- unsigned int min_exit_lat = UINT_MAX;
7114
- int cpu, max_spare_cap_cpu;
7115
- struct cpuidle_state *idle;
7116
-
7117
- for (; pd; pd = pd->next) {
7118
- max_spare_cap_cpu = -1;
7119
- max_spare_cap = 0;
7120
-
7121
- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
7122
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
7123
- continue;
7124
-
7125
- util = cpu_util_next(cpu, p, cpu);
7126
- cpu_cap = capacity_of(cpu);
7127
- spare_cap = cpu_cap - util;
7128
-
7129
- /*
7130
- * Skip CPUs that cannot satisfy the capacity request.
7131
- * IOW, placing the task there would make the CPU
7132
- * overutilized. Take uclamp into account to see how
7133
- * much capacity we can get out of the CPU; this is
7134
- * aligned with schedutil_cpu_util().
7135
- */
7136
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
7137
- if (cpu_cap * 1024 < util * capacity_margin)
7138
- continue;
7139
-
7140
- /*
7141
- * Find the CPU with the maximum spare capacity in
7142
- * the performance domain
7143
- */
7144
- if (spare_cap > max_spare_cap) {
7145
- max_spare_cap = spare_cap;
7146
- max_spare_cap_cpu = cpu;
7147
- }
7148
-
7149
- if (!prefer_idle)
7150
- continue;
7151
-
7152
- if (idle_cpu(cpu)) {
7153
- cpu_cap = capacity_orig_of(cpu);
7154
- if (boosted && cpu_cap < target_cap)
7155
- continue;
7156
- if (!boosted && cpu_cap > target_cap)
7157
- continue;
7158
- idle = idle_get_state(cpu_rq(cpu));
7159
- if (idle && idle->exit_latency > min_exit_lat &&
7160
- cpu_cap == target_cap)
7161
- continue;
7162
-
7163
- if (idle)
7164
- min_exit_lat = idle->exit_latency;
7165
- target_cap = cpu_cap;
7166
- best_idle_cpu = cpu;
7167
- } else if (spare_cap > highest_spare_cap) {
7168
- highest_spare_cap = spare_cap;
7169
- highest_spare_cap_cpu = cpu;
7170
- }
7171
- }
7172
-
7173
- if (!prefer_idle && max_spare_cap_cpu >= 0)
7174
- cpumask_set_cpu(max_spare_cap_cpu, cpus);
7175
- }
7176
-
7177
- if (!prefer_idle)
7178
- return;
7179
-
7180
- if (best_idle_cpu >= 0)
7181
- cpumask_set_cpu(best_idle_cpu, cpus);
7182
- else
7183
- cpumask_set_cpu(highest_spare_cap_cpu, cpus);
7184
-}
7185
-
7186
-static DEFINE_PER_CPU(cpumask_t, energy_cpus);
71876641
71886642 /*
71896643 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
....@@ -7224,27 +6678,39 @@
72246678 * other use-cases too. So, until someone finds a better way to solve this,
72256679 * let's keep things simple by re-using the existing slow path.
72266680 */
7227
-
72286681 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync)
72296682 {
7230
- unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
6683
+ unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6684
+ unsigned long best_delta2 = ULONG_MAX;
72316685 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
7232
- int weight, cpu, best_energy_cpu = prev_cpu;
7233
- unsigned long cur_energy;
7234
- struct perf_domain *pd;
6686
+ int max_spare_cap_cpu_ls = prev_cpu, best_idle_cpu = -1;
6687
+ unsigned long max_spare_cap_ls = 0, target_cap;
6688
+ unsigned long cpu_cap, util, base_energy = 0;
6689
+ bool boosted, latency_sensitive = false;
6690
+ unsigned int min_exit_lat = UINT_MAX;
6691
+ int cpu, best_energy_cpu = prev_cpu;
6692
+ struct cpuidle_state *idle;
72356693 struct sched_domain *sd;
7236
- cpumask_t *candidates;
6694
+ struct perf_domain *pd;
6695
+ int new_cpu = INT_MAX;
72376696
7238
- if (sysctl_sched_sync_hint_enable && sync) {
7239
- cpu = smp_processor_id();
7240
- if (cpumask_test_cpu(cpu, &p->cpus_allowed))
7241
- return cpu;
7242
- }
6697
+ sync_entity_load_avg(&p->se);
6698
+ trace_android_rvh_find_energy_efficient_cpu(p, prev_cpu, sync, &new_cpu);
6699
+ if (new_cpu != INT_MAX)
6700
+ return new_cpu;
72436701
72446702 rcu_read_lock();
72456703 pd = rcu_dereference(rd->pd);
72466704 if (!pd || READ_ONCE(rd->overutilized))
72476705 goto fail;
6706
+
6707
+ cpu = smp_processor_id();
6708
+ if (sync && cpu_rq(cpu)->nr_running == 1 &&
6709
+ cpumask_test_cpu(cpu, p->cpus_ptr) &&
6710
+ task_fits_capacity(p, capacity_of(cpu))) {
6711
+ rcu_read_unlock();
6712
+ return cpu;
6713
+ }
72486714
72496715 /*
72506716 * Energy-aware wake-up happens on the lowest sched_domain starting
....@@ -7256,59 +6722,149 @@
72566722 if (!sd)
72576723 goto fail;
72586724
7259
- sync_entity_load_avg(&p->se);
72606725 if (!task_util_est(p))
72616726 goto unlock;
72626727
7263
- /* Pre-select a set of candidate CPUs. */
7264
- candidates = this_cpu_ptr(&energy_cpus);
7265
- cpumask_clear(candidates);
6728
+ latency_sensitive = uclamp_latency_sensitive(p);
6729
+ boosted = uclamp_boosted(p);
6730
+ target_cap = boosted ? 0 : ULONG_MAX;
72666731
7267
- if (sched_feat(FIND_BEST_TARGET))
7268
- find_best_target(sd, candidates, p);
7269
- else
7270
- select_cpu_candidates(sd, candidates, pd, p, prev_cpu);
6732
+ for (; pd; pd = pd->next) {
6733
+ unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6734
+ unsigned long base_energy_pd;
6735
+ int max_spare_cap_cpu = -1;
72716736
7272
- /* Bail out if no candidate was found. */
7273
- weight = cpumask_weight(candidates);
7274
- if (!weight)
7275
- goto unlock;
6737
+ /* Compute the 'base' energy of the pd, without @p */
6738
+ base_energy_pd = compute_energy(p, -1, pd);
6739
+ base_energy += base_energy_pd;
72766740
7277
- /* If there is only one sensible candidate, select it now. */
7278
- cpu = cpumask_first(candidates);
7279
- if (weight == 1 && ((uclamp_latency_sensitive(p) && idle_cpu(cpu)) ||
7280
- (cpu == prev_cpu))) {
7281
- best_energy_cpu = cpu;
7282
- goto unlock;
7283
- }
6741
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6742
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6743
+ continue;
72846744
7285
- if (cpumask_test_cpu(prev_cpu, &p->cpus_allowed))
7286
- prev_energy = best_energy = compute_energy(p, prev_cpu, pd);
7287
- else
7288
- prev_energy = best_energy = ULONG_MAX;
6745
+ util = cpu_util_next(cpu, p, cpu);
6746
+ cpu_cap = capacity_of(cpu);
6747
+ spare_cap = cpu_cap;
6748
+ lsub_positive(&spare_cap, util);
72896749
7290
- /* Select the best candidate energy-wise. */
7291
- for_each_cpu(cpu, candidates) {
7292
- if (cpu == prev_cpu)
7293
- continue;
7294
- cur_energy = compute_energy(p, cpu, pd);
7295
- if (cur_energy < best_energy) {
7296
- best_energy = cur_energy;
7297
- best_energy_cpu = cpu;
6750
+ /*
6751
+ * Skip CPUs that cannot satisfy the capacity request.
6752
+ * IOW, placing the task there would make the CPU
6753
+ * overutilized. Take uclamp into account to see how
6754
+ * much capacity we can get out of the CPU; this is
6755
+ * aligned with schedutil_cpu_util().
6756
+ */
6757
+ util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
6758
+ if (!fits_capacity(util, cpu_cap))
6759
+ continue;
6760
+
6761
+ /* Always use prev_cpu as a candidate. */
6762
+ if (!latency_sensitive && cpu == prev_cpu) {
6763
+ prev_delta = compute_energy(p, prev_cpu, pd);
6764
+ prev_delta -= base_energy_pd;
6765
+ best_delta = min(best_delta, prev_delta);
6766
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6767
+ if (prev_delta == best_delta)
6768
+ best_energy_cpu = prev_cpu;
6769
+ }
6770
+ }
6771
+
6772
+ /*
6773
+ * Find the CPU with the maximum spare capacity in
6774
+ * the performance domain
6775
+ */
6776
+ if (spare_cap > max_spare_cap) {
6777
+ max_spare_cap = spare_cap;
6778
+ max_spare_cap_cpu = cpu;
6779
+ }
6780
+
6781
+ if (!IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6782
+ if (!latency_sensitive)
6783
+ continue;
6784
+ }
6785
+
6786
+ if (idle_cpu(cpu)) {
6787
+ cpu_cap = capacity_orig_of(cpu);
6788
+ if (boosted && cpu_cap < target_cap)
6789
+ continue;
6790
+ if (!boosted && cpu_cap > target_cap)
6791
+ continue;
6792
+ idle = idle_get_state(cpu_rq(cpu));
6793
+ if (idle && idle->exit_latency > min_exit_lat &&
6794
+ cpu_cap == target_cap)
6795
+ continue;
6796
+
6797
+ if (idle)
6798
+ min_exit_lat = idle->exit_latency;
6799
+ target_cap = cpu_cap;
6800
+ best_idle_cpu = cpu;
6801
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6802
+ best_delta2 = compute_energy(p, cpu, pd);
6803
+ best_delta2 -= base_energy_pd;
6804
+ }
6805
+ } else if (spare_cap > max_spare_cap_ls) {
6806
+ max_spare_cap_ls = spare_cap;
6807
+ max_spare_cap_cpu_ls = cpu;
6808
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6809
+ if (best_idle_cpu == -1) {
6810
+ best_delta2 = compute_energy(p, cpu, pd);
6811
+ best_delta2 -= base_energy_pd;
6812
+ }
6813
+ }
6814
+ }
6815
+ }
6816
+
6817
+ /* Evaluate the energy impact of using this CPU. */
6818
+ if (!latency_sensitive && max_spare_cap_cpu >= 0 &&
6819
+ max_spare_cap_cpu != prev_cpu) {
6820
+ cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6821
+ cur_delta -= base_energy_pd;
6822
+ if (cur_delta < best_delta) {
6823
+ best_delta = cur_delta;
6824
+ best_energy_cpu = max_spare_cap_cpu;
6825
+ }
72986826 }
72996827 }
73006828 unlock:
73016829 rcu_read_unlock();
73026830
6831
+ if (latency_sensitive)
6832
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6833
+
73036834 /*
73046835 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
73056836 * least 6% of the energy used by prev_cpu.
73066837 */
7307
- if (prev_energy == ULONG_MAX)
6838
+ if (prev_delta == ULONG_MAX)
73086839 return best_energy_cpu;
73096840
7310
- if ((prev_energy - best_energy) > (prev_energy >> 4))
6841
+ if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
73116842 return best_energy_cpu;
6843
+
6844
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6845
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6846
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6847
+ int level = rockchip_perf_get_level();
6848
+
6849
+ /*
6850
+ * when select ROCKCHIP_PERFORMANCE_LOW:
6851
+ * Pick best_energy_cpu if prev_cpu is big cpu and best_energy_cpu
6852
+ * is little cpu, so that tasks can migrate from big cpu to little
6853
+ * cpu easier to save power.
6854
+ */
6855
+ if ((level == ROCKCHIP_PERFORMANCE_LOW) && cpul_mask &&
6856
+ cpub_mask && cpumask_test_cpu(prev_cpu, cpub_mask) &&
6857
+ cpumask_test_cpu(best_energy_cpu, cpul_mask)) {
6858
+ return best_energy_cpu;
6859
+ }
6860
+
6861
+ /*
6862
+ * Pick the idlest CPU if the power increase is small (<3.1%).
6863
+ */
6864
+ if ((best_delta2 <= prev_delta) ||
6865
+ ((best_delta2 - prev_delta) < ((prev_delta + base_energy) >> 5)))
6866
+ return best_idle_cpu >= 0 ? best_idle_cpu : max_spare_cap_cpu_ls;
6867
+ }
73126868
73136869 return prev_cpu;
73146870
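
The final decision keeps prev_cpu unless the best candidate saves enough energy: the generic check requires (prev_delta - best_delta) > (prev_delta + base_energy) >> 4, i.e. at least ~6.25% of the estimated total, while the Rockchip branch accepts the idle candidate at a looser >> 5 (~3.1%) threshold. A small sketch of that shift-based percentage test:

#include <stdio.h>
#include <stdbool.h>

/* True when switching away from prev saves at least total / 2^shift energy. */
static bool saves_enough(unsigned long prev_delta, unsigned long best_delta,
			 unsigned long base_energy, unsigned int shift)
{
	if (best_delta >= prev_delta)
		return false;
	return (prev_delta - best_delta) > ((prev_delta + base_energy) >> shift);
}

int main(void)
{
	unsigned long base_energy = 1000, prev_delta = 200;

	/* shift 4: threshold is (1200 >> 4) = 75 units, about 6.25%. */
	printf("%d\n", saves_enough(prev_delta, 120, base_energy, 4));	/* 80 > 75: 1 */
	printf("%d\n", saves_enough(prev_delta, 130, base_energy, 4));	/* 70 > 75: 0 */
	/* shift 5: threshold drops to 37 units, about 3.1%. */
	printf("%d\n", saves_enough(prev_delta, 130, base_energy, 5));	/* 70 > 37: 1 */
	return 0;
}
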
....@@ -7331,39 +6887,44 @@
73316887 * preempt must be disabled.
73326888 */
73336889 static int
7334
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
7335
- int sibling_count_hint)
6890
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
73366891 {
73376892 struct sched_domain *tmp, *sd = NULL;
73386893 int cpu = smp_processor_id();
73396894 int new_cpu = prev_cpu;
73406895 int want_affine = 0;
73416896 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6897
+ int target_cpu = -1;
6898
+
6899
+ if (trace_android_rvh_select_task_rq_fair_enabled() &&
6900
+ !(sd_flag & SD_BALANCE_FORK))
6901
+ sync_entity_load_avg(&p->se);
6902
+ trace_android_rvh_select_task_rq_fair(p, prev_cpu, sd_flag,
6903
+ wake_flags, &target_cpu);
6904
+ if (target_cpu >= 0)
6905
+ return target_cpu;
73426906
73436907 if (sd_flag & SD_BALANCE_WAKE) {
73446908 record_wakee(p);
73456909
7346
- if (static_branch_unlikely(&sched_energy_present)) {
7347
- if (uclamp_latency_sensitive(p) && !sched_feat(EAS_PREFER_IDLE) && !sync)
7348
- goto sd_loop;
6910
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6911
+ if (rockchip_perf_get_level() == ROCKCHIP_PERFORMANCE_HIGH)
6912
+ goto no_eas;
6913
+ }
73496914
6915
+ if (sched_energy_enabled()) {
73506916 new_cpu = find_energy_efficient_cpu(p, prev_cpu, sync);
73516917 if (new_cpu >= 0)
73526918 return new_cpu;
73536919 new_cpu = prev_cpu;
73546920 }
73556921
7356
- want_affine = !wake_wide(p, sibling_count_hint) &&
7357
- !wake_cap(p, cpu, prev_cpu) &&
7358
- cpumask_test_cpu(cpu, &p->cpus_allowed);
6922
+no_eas:
6923
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
73596924 }
73606925
7361
-sd_loop:
73626926 rcu_read_lock();
73636927 for_each_domain(cpu, tmp) {
7364
- if (!(tmp->flags & SD_LOAD_BALANCE))
7365
- break;
7366
-
73676928 /*
73686929 * If both 'cpu' and 'prev_cpu' are part of this domain,
73696930 * cpu is a valid SD_WAKE_AFFINE target.
....@@ -7390,6 +6951,23 @@
73906951 /* Fast path */
73916952
73926953 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6954
+
6955
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
6956
+ struct root_domain *rd = cpu_rq(cpu)->rd;
6957
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
6958
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
6959
+ int level = rockchip_perf_get_level();
6960
+
6961
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
6962
+ cpul_mask && cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
6963
+ cpumask_test_cpu(new_cpu, cpul_mask)) {
6964
+ for_each_domain(cpu, tmp) {
6965
+ sd = tmp;
6966
+ }
6967
+ if (sd)
6968
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6969
+ }
6970
+ }
73936971
73946972 if (want_affine)
73956973 current->recent_used_cpu = cpu;
....@@ -7467,6 +7045,15 @@
74677045 {
74687046 remove_entity_load_avg(&p->se);
74697047 }
7048
+
7049
+static int
7050
+balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
7051
+{
7052
+ if (rq->nr_running)
7053
+ return 1;
7054
+
7055
+ return newidle_balance(rq, rf) != 0;
7056
+}
74707057 #endif /* CONFIG_SMP */
74717058
74727059 static unsigned long wakeup_gran(struct sched_entity *se)
....@@ -7520,7 +7107,7 @@
75207107
75217108 static void set_last_buddy(struct sched_entity *se)
75227109 {
7523
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7110
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75247111 return;
75257112
75267113 for_each_sched_entity(se) {
....@@ -7532,7 +7119,7 @@
75327119
75337120 static void set_next_buddy(struct sched_entity *se)
75347121 {
7535
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7122
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
75367123 return;
75377124
75387125 for_each_sched_entity(se) {
....@@ -7558,6 +7145,7 @@
75587145 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
75597146 int scale = cfs_rq->nr_running >= sched_nr_latency;
75607147 int next_buddy_marked = 0;
7148
+ bool preempt = false, nopreempt = false;
75617149
75627150 if (unlikely(se == pse))
75637151 return;
....@@ -7590,8 +7178,8 @@
75907178 return;
75917179
75927180 /* Idle tasks are by definition preempted by non-idle tasks. */
7593
- if (unlikely(curr->policy == SCHED_IDLE) &&
7594
- likely(p->policy != SCHED_IDLE))
7181
+ if (unlikely(task_has_idle_policy(curr)) &&
7182
+ likely(!task_has_idle_policy(p)))
75957183 goto preempt;
75967184
75977185 /*
....@@ -7603,6 +7191,12 @@
76037191
76047192 find_matching_se(&se, &pse);
76057193 update_curr(cfs_rq_of(se));
7194
+ trace_android_rvh_check_preempt_wakeup(rq, p, &preempt, &nopreempt,
7195
+ wake_flags, se, pse, next_buddy_marked, sysctl_sched_wakeup_granularity);
7196
+ if (preempt)
7197
+ goto preempt;
7198
+ if (nopreempt)
7199
+ return;
76067200 BUG_ON(!pse);
76077201 if (wakeup_preempt_entity(se, pse) == 1) {
76087202 /*
....@@ -7617,7 +7211,7 @@
76177211 return;
76187212
76197213 preempt:
7620
- resched_curr(rq);
7214
+ resched_curr_lazy(rq);
76217215 /*
76227216 * Only set the backward buddy when the current task is still
76237217 * on the rq. This can happen when a wakeup gets interleaved
....@@ -7634,20 +7228,21 @@
76347228 set_last_buddy(se);
76357229 }
76367230
7637
-static struct task_struct *
7231
+struct task_struct *
76387232 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
76397233 {
76407234 struct cfs_rq *cfs_rq = &rq->cfs;
7641
- struct sched_entity *se;
7642
- struct task_struct *p;
7235
+ struct sched_entity *se = NULL;
7236
+ struct task_struct *p = NULL;
76437237 int new_tasks;
7238
+ bool repick = false;
76447239
76457240 again:
7646
- if (!cfs_rq->nr_running)
7241
+ if (!sched_fair_runnable(rq))
76477242 goto idle;
76487243
76497244 #ifdef CONFIG_FAIR_GROUP_SCHED
7650
- if (prev->sched_class != &fair_sched_class)
7245
+ if (!prev || prev->sched_class != &fair_sched_class)
76517246 goto simple;
76527247
76537248 /*
....@@ -7694,7 +7289,7 @@
76947289 } while (cfs_rq);
76957290
76967291 p = task_of(se);
7697
-
7292
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, false, prev);
76987293 /*
76997294 * Since we haven't yet done put_prev_entity and if the selected task
77007295 * is a different task than we started out with, try and touch the
....@@ -7724,8 +7319,15 @@
77247319 goto done;
77257320 simple:
77267321 #endif
7322
+ if (prev)
7323
+ put_prev_task(rq, prev);
77277324
7728
- put_prev_task(rq, prev);
7325
+ trace_android_rvh_replace_next_task_fair(rq, &p, &se, &repick, true, prev);
7326
+ if (repick) {
7327
+ for_each_sched_entity(se)
7328
+ set_next_entity(cfs_rq_of(se), se);
7329
+ goto done;
7330
+ }
77297331
77307332 do {
77317333 se = pick_next_entity(cfs_rq, NULL);
....@@ -7753,11 +7355,13 @@
77537355 return p;
77547356
77557357 idle:
7756
- update_misfit_status(NULL, rq);
7757
- new_tasks = idle_balance(rq, rf);
7358
+ if (!rf)
7359
+ return NULL;
7360
+
7361
+ new_tasks = newidle_balance(rq, rf);
77587362
77597363 /*
7760
- * Because idle_balance() releases (and re-acquires) rq->lock, it is
7364
+ * Because newidle_balance() releases (and re-acquires) rq->lock, it is
77617365 * possible for any higher priority task to appear. In that case we
77627366 * must re-start the pick_next_entity() loop.
77637367 */
....@@ -7774,6 +7378,11 @@
77747378 update_idle_rq_clock_pelt(rq);
77757379
77767380 return NULL;
7381
+}
7382
+
7383
+static struct task_struct *__pick_next_task_fair(struct rq *rq)
7384
+{
7385
+ return pick_next_task_fair(rq, NULL, NULL);
77777386 }
77787387
77797388 /*
....@@ -7826,7 +7435,7 @@
78267435 set_skip_buddy(se);
78277436 }
78287437
7829
-static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7438
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
78307439 {
78317440 struct sched_entity *se = &p->se;
78327441
....@@ -7961,15 +7570,54 @@
79617570 * rewrite all of this once again.]
79627571 */
79637572
7964
-static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7573
+unsigned long __read_mostly max_load_balance_interval = HZ/10;
7574
+EXPORT_SYMBOL_GPL(max_load_balance_interval);
79657575
79667576 enum fbq_type { regular, remote, all };
79677577
7578
+/*
7579
+ * 'group_type' describes the group of CPUs at the moment of load balancing.
7580
+ *
7581
+ * The enum is ordered by pulling priority, with the group with lowest priority
7582
+ * first so the group_type can simply be compared when selecting the busiest
7583
+ * group. See update_sd_pick_busiest().
7584
+ */
79687585 enum group_type {
7969
- group_other = 0,
7586
+ /* The group has spare capacity that can be used to run more tasks. */
7587
+ group_has_spare = 0,
7588
+ /*
7589
+ * The group is fully used and the tasks don't compete for more CPU
7590
+ * cycles. Nevertheless, some tasks might wait before running.
7591
+ */
7592
+ group_fully_busy,
7593
+ /*
7594
+ * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7595
+ * and must be migrated to a more powerful CPU.
7596
+ */
79707597 group_misfit_task,
7598
+ /*
7599
+ * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7600
+ * and the task should be migrated to it instead of running on the
7601
+ * current CPU.
7602
+ */
7603
+ group_asym_packing,
7604
+ /*
7605
+ * The tasks' affinity constraints previously prevented the scheduler
7606
+ * from balancing the load across the system.
7607
+ */
79717608 group_imbalanced,
7972
- group_overloaded,
7609
+ /*
7610
+ * The CPU is overloaded and can't provide expected CPU cycles to all
7611
+ * tasks.
7612
+ */
7613
+ group_overloaded
7614
+};
7615
+
7616
+enum migration_type {
7617
+ migrate_load = 0,
7618
+ migrate_util,
7619
+ migrate_task,
7620
+ migrate_misfit
79737621 };
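
Because group_type is declared in strictly increasing pulling priority, update_sd_pick_busiest() can rank two candidate groups with a plain integer comparison, and migration_type then tells detach_tasks() which unit (load, utilization, task count, or misfit) the computed imbalance is expressed in. A small sketch of why the ordering alone is enough to pick the busier group (the enum values mirror the ones added above, the helper names are illustrative):

#include <stdio.h>

/* Ordered by pulling priority, lowest first, like the enum above. */
enum group_type_sketch {
	HAS_SPARE = 0,
	FULLY_BUSY,
	MISFIT_TASK,
	ASYM_PACKING,
	IMBALANCED,
	OVERLOADED,
};

struct sg_stats_sketch {
	const char *name;
	enum group_type_sketch type;
};

/* Picking the busier of two groups reduces to comparing enum values. */
static const struct sg_stats_sketch *
pick_busiest_sketch(const struct sg_stats_sketch *a,
		    const struct sg_stats_sketch *b)
{
	return a->type >= b->type ? a : b;
}

int main(void)
{
	struct sg_stats_sketch g1 = { "g1", FULLY_BUSY };
	struct sg_stats_sketch g2 = { "g2", OVERLOADED };
	struct sg_stats_sketch g3 = { "g3", MISFIT_TASK };

	printf("%s\n", pick_busiest_sketch(&g1, &g2)->name);	/* g2 */
	printf("%s\n", pick_busiest_sketch(&g3, &g1)->name);	/* g3 */
	return 0;
}
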
79747622
79757623 #define LBF_ALL_PINNED 0x01
....@@ -7992,7 +7640,6 @@
79927640 int new_dst_cpu;
79937641 enum cpu_idle_type idle;
79947642 long imbalance;
7995
- unsigned int src_grp_nr_running;
79967643 /* The set of CPUs under consideration for load-balancing */
79977644 struct cpumask *cpus;
79987645
....@@ -8003,8 +7650,9 @@
80037650 unsigned int loop_max;
80047651
80057652 enum fbq_type fbq_type;
8006
- enum group_type src_grp_type;
7653
+ enum migration_type migration_type;
80077654 struct list_head tasks;
7655
+ struct rq_flags *src_rq_rf;
80087656 };
80097657
80107658 /*
....@@ -8019,7 +7667,11 @@
80197667 if (p->sched_class != &fair_sched_class)
80207668 return 0;
80217669
8022
- if (unlikely(p->policy == SCHED_IDLE))
7670
+ if (unlikely(task_has_idle_policy(p)))
7671
+ return 0;
7672
+
7673
+ /* SMT siblings share cache */
7674
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
80237675 return 0;
80247676
80257677 /*
....@@ -8107,20 +7759,29 @@
81077759 int can_migrate_task(struct task_struct *p, struct lb_env *env)
81087760 {
81097761 int tsk_cache_hot;
7762
+ int can_migrate = 1;
81107763
81117764 lockdep_assert_held(&env->src_rq->lock);
7765
+
7766
+ trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate);
7767
+ if (!can_migrate)
7768
+ return 0;
81127769
81137770 /*
81147771 * We do not migrate tasks that are:
81157772 * 1) throttled_lb_pair, or
8116
- * 2) cannot be migrated to this CPU due to cpus_allowed, or
7773
+ * 2) cannot be migrated to this CPU due to cpus_ptr, or
81177774 * 3) running (obviously), or
81187775 * 4) are cache-hot on their current CPU.
81197776 */
81207777 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
81217778 return 0;
81227779
8123
- if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
7780
+ /* Disregard pcpu kthreads; they are where they need to be. */
7781
+ if (kthread_is_per_cpu(p))
7782
+ return 0;
7783
+
7784
+ if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
81247785 int cpu;
81257786
81267787 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
....@@ -8140,7 +7801,7 @@
81407801
81417802 /* Prevent to re-select dst_cpu via env's CPUs: */
81427803 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
8143
- if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7804
+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
81447805 env->flags |= LBF_DST_PINNED;
81457806 env->new_dst_cpu = cpu;
81467807 break;
....@@ -8186,9 +7847,20 @@
81867847 */
81877848 static void detach_task(struct task_struct *p, struct lb_env *env)
81887849 {
7850
+ int detached = 0;
7851
+
81897852 lockdep_assert_held(&env->src_rq->lock);
81907853
8191
- p->on_rq = TASK_ON_RQ_MIGRATING;
7854
+ /*
7855
+ * The vendor hook may drop the lock temporarily, so
7856
+ * pass the rq flags to unpin lock. We expect the
7857
+ * rq lock to be held after return.
7858
+ */
7859
+ trace_android_rvh_migrate_queued_task(env->src_rq, env->src_rq_rf, p,
7860
+ env->dst_cpu, &detached);
7861
+ if (detached)
7862
+ return;
7863
+
81927864 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
81937865 set_task_cpu(p, env->dst_cpu);
81947866 }
....@@ -8227,7 +7899,7 @@
82277899 static const unsigned int sched_nr_migrate_break = 32;
82287900
82297901 /*
8230
- * detach_tasks() -- tries to detach up to imbalance weighted load from
7902
+ * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
82317903 * busiest_rq, as part of a balancing operation within domain "sd".
82327904 *
82337905 * Returns number of detached tasks if successful and 0 otherwise.
....@@ -8235,8 +7907,8 @@
82357907 static int detach_tasks(struct lb_env *env)
82367908 {
82377909 struct list_head *tasks = &env->src_rq->cfs_tasks;
7910
+ unsigned long util, load;
82387911 struct task_struct *p;
8239
- unsigned long load;
82407912 int detached = 0;
82417913
82427914 lockdep_assert_held(&env->src_rq->lock);
....@@ -8266,39 +7938,64 @@
82667938 break;
82677939 }
82687940
8269
-#ifdef CONFIG_ROCKCHIP_SCHED_PERFORMANCE_BIAS
8270
- if (sysctl_sched_performance_bias) {
8271
- if ((env->idle == CPU_NOT_IDLE) && (!task_fits_max(p, env->dst_cpu)))
8272
- goto next;
8273
- }
8274
-#endif
8275
-
82767941 if (!can_migrate_task(p, env))
82777942 goto next;
82787943
8279
- /*
8280
- * Depending of the number of CPUs and tasks and the
8281
- * cgroup hierarchy, task_h_load() can return a null
8282
- * value. Make sure that env->imbalance decreases
8283
- * otherwise detach_tasks() will stop only after
8284
- * detaching up to loop_max tasks.
8285
- */
8286
- load = max_t(unsigned long, task_h_load(p), 1);
7944
+ switch (env->migration_type) {
7945
+ case migrate_load:
7946
+ /*
7947
+ * Depending on the number of CPUs and tasks and the
7948
+ * cgroup hierarchy, task_h_load() can return a null
7949
+ * value. Make sure that env->imbalance decreases
7950
+ * otherwise detach_tasks() will stop only after
7951
+ * detaching up to loop_max tasks.
7952
+ */
7953
+ load = max_t(unsigned long, task_h_load(p), 1);
82877954
7955
+ if (sched_feat(LB_MIN) &&
7956
+ load < 16 && !env->sd->nr_balance_failed)
7957
+ goto next;
82887958
8289
- if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
8290
- goto next;
7959
+ /*
7960
+ * Make sure that we don't migrate too much load.
7961
+ * Nevertheless, let's relax the constraint if the
7962
+ * scheduler fails to find a good waiting task to
7963
+ * migrate.
7964
+ */
7965
+ if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
7966
+ goto next;
82917967
8292
- if ((load / 2) > env->imbalance)
8293
- goto next;
7968
+ env->imbalance -= load;
7969
+ break;
7970
+
7971
+ case migrate_util:
7972
+ util = task_util_est(p);
7973
+
7974
+ if (util > env->imbalance)
7975
+ goto next;
7976
+
7977
+ env->imbalance -= util;
7978
+ break;
7979
+
7980
+ case migrate_task:
7981
+ env->imbalance--;
7982
+ break;
7983
+
7984
+ case migrate_misfit:
7985
+ /* This is not a misfit task */
7986
+ if (task_fits_capacity(p, capacity_of(env->src_cpu)))
7987
+ goto next;
7988
+
7989
+ env->imbalance = 0;
7990
+ break;
7991
+ }
82947992
82957993 detach_task(p, env);
82967994 list_add(&p->se.group_node, &env->tasks);
82977995
82987996 detached++;
8299
- env->imbalance -= load;
83007997
8301
-#ifdef CONFIG_PREEMPT
7998
+#ifdef CONFIG_PREEMPTION
83027999 /*
83038000 * NEWIDLE balancing is a source of latency, so preemptible
83048001 * kernels will stop after the first task is detached to minimize
....@@ -8310,7 +8007,7 @@
83108007
83118008 /*
83128009 * We only want to steal up to the prescribed amount of
8313
- * weighted load.
8010
+ * load/util/tasks.
83148011 */
83158012 if (env->imbalance <= 0)
83168013 break;
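
As a worked example of the migrate_load check above: shr_bound() right-shifts the task load by nr_balance_failed, so each failed balance round halves the effective threshold and a task that initially looks too heavy is eventually accepted. This is a minimal user-space sketch; shr_bound() is mirrored here and the load/imbalance values are invented:

#include <stdio.h>

/* Mirror of the kernel's shr_bound(): shift right, bounded to the type width. */
static unsigned long shr_bound(unsigned long val, unsigned int shift)
{
        unsigned int max_shift = sizeof(val) * 8 - 1;

        return val >> (shift < max_shift ? shift : max_shift);
}

int main(void)
{
        unsigned long load = 800, imbalance = 300;
        unsigned int fails;

        for (fails = 0; fails < 4; fails++)
                printf("nr_balance_failed=%u: %s\n", fails,
                       shr_bound(load, fails) > imbalance ? "skip" : "detach");
        /* 800 and 400 are skipped; 200 and 100 are detached */
        return 0;
}
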
....@@ -8339,7 +8036,6 @@
83398036
83408037 BUG_ON(task_rq(p) != rq);
83418038 activate_task(rq, p, ENQUEUE_NOCLOCK);
8342
- p->on_rq = TASK_ON_RQ_QUEUED;
83438039 check_preempt_curr(rq, p, 0);
83448040 }
83458041
....@@ -8380,6 +8076,7 @@
83808076 rq_unlock(env->dst_rq, &rf);
83818077 }
83828078
8079
+#ifdef CONFIG_NO_HZ_COMMON
83838080 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
83848081 {
83858082 if (cfs_rq->avg.load_avg)
....@@ -8399,12 +8096,54 @@
83998096 if (READ_ONCE(rq->avg_dl.util_avg))
84008097 return true;
84018098
8099
+ if (thermal_load_avg(rq))
8100
+ return true;
8101
+
84028102 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
84038103 if (READ_ONCE(rq->avg_irq.util_avg))
84048104 return true;
84058105 #endif
84068106
84078107 return false;
8108
+}
8109
+
8110
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8111
+{
8112
+ rq->last_blocked_load_update_tick = jiffies;
8113
+
8114
+ if (!has_blocked)
8115
+ rq->has_blocked_load = 0;
8116
+}
8117
+#else
8118
+static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8119
+static inline bool others_have_blocked(struct rq *rq) { return false; }
8120
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8121
+#endif
8122
+
8123
+static bool __update_blocked_others(struct rq *rq, bool *done)
8124
+{
8125
+ const struct sched_class *curr_class;
8126
+ u64 now = rq_clock_pelt(rq);
8127
+ unsigned long thermal_pressure;
8128
+ bool decayed;
8129
+
8130
+ /*
8131
+ * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8132
+ * DL and IRQ signals have been updated before updating CFS.
8133
+ */
8134
+ curr_class = rq->curr->sched_class;
8135
+
8136
+ thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8137
+
8138
+ decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8139
+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
8140
+ update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
8141
+ update_irq_load_avg(rq, 0);
8142
+
8143
+ if (others_have_blocked(rq))
8144
+ *done = false;
8145
+
8146
+ return decayed;
84088147 }
84098148
84108149 #ifdef CONFIG_FAIR_GROUP_SCHED
....@@ -8420,22 +8159,17 @@
84208159 if (cfs_rq->avg.util_sum)
84218160 return false;
84228161
8423
- if (cfs_rq->avg.runnable_load_sum)
8162
+ if (cfs_rq->avg.runnable_sum)
84248163 return false;
84258164
84268165 return true;
84278166 }
84288167
8429
-static void update_blocked_averages(int cpu)
8168
+static bool __update_blocked_fair(struct rq *rq, bool *done)
84308169 {
8431
- struct rq *rq = cpu_rq(cpu);
84328170 struct cfs_rq *cfs_rq, *pos;
8433
- const struct sched_class *curr_class;
8434
- struct rq_flags rf;
8435
- bool done = true;
8436
-
8437
- rq_lock_irqsave(rq, &rf);
8438
- update_rq_clock(rq);
8171
+ bool decayed = false;
8172
+ int cpu = cpu_of(rq);
84398173
84408174 /*
84418175 * Iterates the task_group tree in a bottom up fashion, see
....@@ -8444,8 +8178,12 @@
84448178 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
84458179 struct sched_entity *se;
84468180
8447
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
8448
- update_tg_load_avg(cfs_rq, 0);
8181
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
8182
+ update_tg_load_avg(cfs_rq);
8183
+
8184
+ if (cfs_rq == &rq->cfs)
8185
+ decayed = true;
8186
+ }
84498187
84508188 /* Propagate pending load changes to the parent, if any: */
84518189 se = cfs_rq->tg->se[cpu];
....@@ -8461,23 +8199,10 @@
84618199
84628200 /* Don't need periodic decay once load/util_avg are null */
84638201 if (cfs_rq_has_blocked(cfs_rq))
8464
- done = false;
8202
+ *done = false;
84658203 }
84668204
8467
- curr_class = rq->curr->sched_class;
8468
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8469
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8470
- update_irq_load_avg(rq, 0);
8471
- /* Don't need periodic decay once load/util_avg are null */
8472
- if (others_have_blocked(rq))
8473
- done = false;
8474
-
8475
-#ifdef CONFIG_NO_HZ_COMMON
8476
- rq->last_blocked_load_update_tick = jiffies;
8477
- if (done)
8478
- rq->has_blocked_load = 0;
8479
-#endif
8480
- rq_unlock_irqrestore(rq, &rf);
8205
+ return decayed;
84818206 }
84828207
84838208 /*
....@@ -8527,27 +8252,16 @@
85278252 cfs_rq_load_avg(cfs_rq) + 1);
85288253 }
85298254 #else
8530
-static inline void update_blocked_averages(int cpu)
8255
+static bool __update_blocked_fair(struct rq *rq, bool *done)
85318256 {
8532
- struct rq *rq = cpu_rq(cpu);
85338257 struct cfs_rq *cfs_rq = &rq->cfs;
8534
- const struct sched_class *curr_class;
8535
- struct rq_flags rf;
8258
+ bool decayed;
85368259
8537
- rq_lock_irqsave(rq, &rf);
8538
- update_rq_clock(rq);
8539
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8260
+ decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8261
+ if (cfs_rq_has_blocked(cfs_rq))
8262
+ *done = false;
85408263
8541
- curr_class = rq->curr->sched_class;
8542
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
8543
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
8544
- update_irq_load_avg(rq, 0);
8545
-#ifdef CONFIG_NO_HZ_COMMON
8546
- rq->last_blocked_load_update_tick = jiffies;
8547
- if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
8548
- rq->has_blocked_load = 0;
8549
-#endif
8550
- rq_unlock_irqrestore(rq, &rf);
8264
+ return decayed;
85518265 }
85528266
85538267 static unsigned long task_h_load(struct task_struct *p)
....@@ -8555,6 +8269,24 @@
85558269 return p->se.avg.load_avg;
85568270 }
85578271 #endif
8272
+
8273
+static void update_blocked_averages(int cpu)
8274
+{
8275
+ bool decayed = false, done = true;
8276
+ struct rq *rq = cpu_rq(cpu);
8277
+ struct rq_flags rf;
8278
+
8279
+ rq_lock_irqsave(rq, &rf);
8280
+ update_rq_clock(rq);
8281
+
8282
+ decayed |= __update_blocked_others(rq, &done);
8283
+ decayed |= __update_blocked_fair(rq, &done);
8284
+
8285
+ update_blocked_load_status(rq, !done);
8286
+ if (decayed)
8287
+ cpufreq_update_util(rq, 0);
8288
+ rq_unlock_irqrestore(rq, &rf);
8289
+}
85588290
85598291 /********** Helpers for find_busiest_group ************************/
85608292
....@@ -8564,15 +8296,15 @@
85648296 struct sg_lb_stats {
85658297 unsigned long avg_load; /*Avg load across the CPUs of the group */
85668298 unsigned long group_load; /* Total load over the CPUs of the group */
8567
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
8568
- unsigned long load_per_task;
85698299 unsigned long group_capacity;
8570
- unsigned long group_util; /* Total utilization of the group */
8571
- unsigned int sum_nr_running; /* Nr tasks running in the group */
8300
+ unsigned long group_util; /* Total utilization over the CPUs of the group */
8301
+ unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
8302
+ unsigned int sum_nr_running; /* Nr of tasks running in the group */
8303
+ unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
85728304 unsigned int idle_cpus;
85738305 unsigned int group_weight;
85748306 enum group_type group_type;
8575
- int group_no_capacity;
8307
+ unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
85768308 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
85778309 #ifdef CONFIG_NUMA_BALANCING
85788310 unsigned int nr_numa_running;
....@@ -8587,10 +8319,10 @@
85878319 struct sd_lb_stats {
85888320 struct sched_group *busiest; /* Busiest group in this sd */
85898321 struct sched_group *local; /* Local group in this sd */
8590
- unsigned long total_running;
85918322 unsigned long total_load; /* Total load of all groups in sd */
85928323 unsigned long total_capacity; /* Total capacity of all groups in sd */
85938324 unsigned long avg_load; /* Average load across all groups in sd */
8325
+ unsigned int prefer_sibling; /* tasks should go to sibling first */
85948326
85958327 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
85968328 struct sg_lb_stats local_stat; /* Statistics of the local group */
....@@ -8601,54 +8333,26 @@
86018333 /*
86028334 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
86038335 * local_stat because update_sg_lb_stats() does a full clear/assignment.
8604
- * We must however clear busiest_stat::avg_load because
8605
- * update_sd_pick_busiest() reads this before assignment.
8336
+ * We must however set busiest_stat::group_type and
8337
+ * busiest_stat::idle_cpus to the worst busiest group because
8338
+ * update_sd_pick_busiest() reads these before assignment.
86068339 */
86078340 *sds = (struct sd_lb_stats){
86088341 .busiest = NULL,
86098342 .local = NULL,
8610
- .total_running = 0UL,
86118343 .total_load = 0UL,
86128344 .total_capacity = 0UL,
86138345 .busiest_stat = {
8614
- .avg_load = 0UL,
8615
- .sum_nr_running = 0,
8616
- .group_type = group_other,
8346
+ .idle_cpus = UINT_MAX,
8347
+ .group_type = group_has_spare,
86178348 },
86188349 };
86198350 }
86208351
8621
-/**
8622
- * get_sd_load_idx - Obtain the load index for a given sched domain.
8623
- * @sd: The sched_domain whose load_idx is to be obtained.
8624
- * @idle: The idle status of the CPU for whose sd load_idx is obtained.
8625
- *
8626
- * Return: The load index.
8627
- */
8628
-static inline int get_sd_load_idx(struct sched_domain *sd,
8629
- enum cpu_idle_type idle)
8630
-{
8631
- int load_idx;
8632
-
8633
- switch (idle) {
8634
- case CPU_NOT_IDLE:
8635
- load_idx = sd->busy_idx;
8636
- break;
8637
-
8638
- case CPU_NEWLY_IDLE:
8639
- load_idx = sd->newidle_idx;
8640
- break;
8641
- default:
8642
- load_idx = sd->idle_idx;
8643
- break;
8644
- }
8645
-
8646
- return load_idx;
8647
-}
8648
-
8649
-static unsigned long scale_rt_capacity(int cpu, unsigned long max)
8352
+static unsigned long scale_rt_capacity(int cpu)
86508353 {
86518354 struct rq *rq = cpu_rq(cpu);
8355
+ unsigned long max = arch_scale_cpu_capacity(cpu);
86528356 unsigned long used, free;
86538357 unsigned long irq;
86548358
....@@ -8657,8 +8361,15 @@
86578361 if (unlikely(irq >= max))
86588362 return 1;
86598363
8364
+ /*
8365
+ * avg_rt.util_avg and avg_dl.util_avg track binary signals
8366
+ * (running and not running) with weights 0 and 1024 respectively.
8367
+ * avg_thermal.load_avg tracks thermal pressure and the weighted
8368
+ * average uses the actual delta max capacity(load).
8369
+ */
86608370 used = READ_ONCE(rq->avg_rt.util_avg);
86618371 used += READ_ONCE(rq->avg_dl.util_avg);
8372
+ used += thermal_load_avg(rq);
86628373
86638374 if (unlikely(used >= max))
86648375 return 1;
....@@ -8668,52 +8379,20 @@
86688379 return scale_irq_capacity(free, irq, max);
86698380 }
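
A worked example of the calculation above, with invented per-class pressure values. scale_irq_capacity() is defined in the scheduler headers; its effect (free * (max - irq) / max) is inlined here purely for illustration:

#include <stdio.h>

int main(void)
{
        unsigned long max = 1024, irq = 64;                     /* invented */
        unsigned long used = 100 /* rt */ + 50 /* dl */ + 70 /* thermal */;
        unsigned long free = max - used;                        /* 804 */

        /* scale_irq_capacity(free, irq, max): free * (max - irq) / max */
        printf("capacity left for CFS: %lu\n", free * (max - irq) / max); /* 753 */
        return 0;
}
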
86708381
8671
-void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
8672
- raw_spin_lock_init(&mcc->lock);
8673
- mcc->val = 0;
8674
- mcc->cpu = -1;
8675
-}
8676
-
86778382 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
86788383 {
8679
- unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8384
+ unsigned long capacity = scale_rt_capacity(cpu);
86808385 struct sched_group *sdg = sd->groups;
8681
- struct max_cpu_capacity *mcc;
8682
- unsigned long max_capacity;
8683
- int max_cap_cpu;
8684
- unsigned long flags;
86858386
8686
- cpu_rq(cpu)->cpu_capacity_orig = capacity;
8687
-
8688
- capacity *= arch_scale_max_freq_capacity(sd, cpu);
8689
- capacity >>= SCHED_CAPACITY_SHIFT;
8690
-
8691
- mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8692
-
8693
- raw_spin_lock_irqsave(&mcc->lock, flags);
8694
- max_capacity = mcc->val;
8695
- max_cap_cpu = mcc->cpu;
8696
-
8697
- if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8698
- (max_capacity < capacity)) {
8699
- mcc->val = capacity;
8700
- mcc->cpu = cpu;
8701
-#ifdef CONFIG_SCHED_DEBUG
8702
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8703
- //printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8704
- // cpu, capacity);
8705
- goto skip_unlock;
8706
-#endif
8707
- }
8708
- raw_spin_unlock_irqrestore(&mcc->lock, flags);
8709
-
8710
-skip_unlock: __attribute__ ((unused));
8711
- capacity = scale_rt_capacity(cpu, capacity);
8387
+ cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
87128388
87138389 if (!capacity)
87148390 capacity = 1;
87158391
8392
+ trace_android_rvh_update_cpu_capacity(cpu, &capacity);
87168393 cpu_rq(cpu)->cpu_capacity = capacity;
8394
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8395
+
87178396 sdg->sgc->capacity = capacity;
87188397 sdg->sgc->min_capacity = capacity;
87198398 sdg->sgc->max_capacity = capacity;
....@@ -8746,29 +8425,11 @@
87468425 */
87478426
87488427 for_each_cpu(cpu, sched_group_span(sdg)) {
8749
- struct sched_group_capacity *sgc;
8750
- struct rq *rq = cpu_rq(cpu);
8428
+ unsigned long cpu_cap = capacity_of(cpu);
87518429
8752
- /*
8753
- * build_sched_domains() -> init_sched_groups_capacity()
8754
- * gets here before we've attached the domains to the
8755
- * runqueues.
8756
- *
8757
- * Use capacity_of(), which is set irrespective of domains
8758
- * in update_cpu_capacity().
8759
- *
8760
- * This avoids capacity from being 0 and
8761
- * causing divide-by-zero issues on boot.
8762
- */
8763
- if (unlikely(!rq->sd)) {
8764
- capacity += capacity_of(cpu);
8765
- } else {
8766
- sgc = rq->sd->groups->sgc;
8767
- capacity += sgc->capacity;
8768
- }
8769
-
8770
- min_capacity = min(capacity, min_capacity);
8771
- max_capacity = max(capacity, max_capacity);
8430
+ capacity += cpu_cap;
8431
+ min_capacity = min(cpu_cap, min_capacity);
8432
+ max_capacity = max(cpu_cap, max_capacity);
87728433 }
87738434 } else {
87748435 /*
....@@ -8805,8 +8466,20 @@
88058466 }
88068467
88078468 /*
8469
+ * Check whether a rq has a misfit task and if it looks like we can actually
8470
+ * help that task: we can migrate the task to a CPU of higher capacity, or
8471
+ * the task's current CPU is heavily pressured.
8472
+ */
8473
+static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8474
+{
8475
+ return rq->misfit_task_load &&
8476
+ (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8477
+ check_cpu_capacity(rq, sd));
8478
+}
8479
+
8480
+/*
88088481 * Group imbalance indicates (and tries to solve) the problem where balancing
8809
- * groups is inadequate due to ->cpus_allowed constraints.
8482
+ * groups is inadequate due to ->cpus_ptr constraints.
88108483 *
88118484 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
88128485 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
....@@ -8851,13 +8524,17 @@
88518524 * any benefit for the load balance.
88528525 */
88538526 static inline bool
8854
-group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8527
+group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88558528 {
88568529 if (sgs->sum_nr_running < sgs->group_weight)
88578530 return true;
88588531
8532
+ if ((sgs->group_capacity * imbalance_pct) <
8533
+ (sgs->group_runnable * 100))
8534
+ return false;
8535
+
88598536 if ((sgs->group_capacity * 100) >
8860
- (sgs->group_util * env->sd->imbalance_pct))
8537
+ (sgs->group_util * imbalance_pct))
88618538 return true;
88628539
88638540 return false;
....@@ -8872,13 +8549,17 @@
88728549 * false.
88738550 */
88748551 static inline bool
8875
-group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8552
+group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
88768553 {
88778554 if (sgs->sum_nr_running <= sgs->group_weight)
88788555 return false;
88798556
88808557 if ((sgs->group_capacity * 100) <
8881
- (sgs->group_util * env->sd->imbalance_pct))
8558
+ (sgs->group_util * imbalance_pct))
8559
+ return true;
8560
+
8561
+ if ((sgs->group_capacity * imbalance_pct) <
8562
+ (sgs->group_runnable * 100))
88828563 return true;
88838564
88848565 return false;
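
A worked example of the imbalance_pct comparison shared by group_has_capacity() and group_is_overloaded() above, assuming an imbalance_pct of 117 (a common default) and invented utilization numbers:

#include <stdbool.h>
#include <stdio.h>

static bool util_leaves_spare(unsigned long capacity, unsigned long util,
                              unsigned int imbalance_pct)
{
        /* same test as group_has_capacity(): capacity * 100 > util * pct */
        return capacity * 100 > util * imbalance_pct;
}

int main(void)
{
        /* with pct = 117, a 1024-capacity group has spare room below ~875 util */
        printf("%d\n", util_leaves_spare(1024, 800, 117)); /* 1: 102400 > 93600  */
        printf("%d\n", util_leaves_spare(1024, 900, 117)); /* 0: 102400 < 105300 */
        return 0;
}
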
....@@ -8891,8 +8572,7 @@
88918572 static inline bool
88928573 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
88938574 {
8894
- return sg->sgc->min_capacity * capacity_margin <
8895
- ref->sgc->min_capacity * 1024;
8575
+ return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
88968576 }
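
The fits_capacity() helper introduced earlier in this patch encodes the ~20% margin as cap * 1280 < max * 1024, i.e. a capacity only "fits" while it stays below roughly 80% of the reference. A user-space mirror with invented values:

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the fits_capacity() macro used above (illustrative only). */
static bool fits_capacity(unsigned long cap, unsigned long max)
{
        return cap * 1280 < max * 1024;
}

int main(void)
{
        printf("%d\n", fits_capacity(810, 1024)); /* 1: 1036800 < 1048576 */
        printf("%d\n", fits_capacity(820, 1024)); /* 0: 1049600 > 1048576 */
        return 0;
}
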
88978577
88988578 /*
....@@ -8902,24 +8582,30 @@
89028582 static inline bool
89038583 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
89048584 {
8905
- return sg->sgc->max_capacity * capacity_margin <
8906
- ref->sgc->max_capacity * 1024;
8585
+ return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
89078586 }
89088587
89098588 static inline enum
8910
-group_type group_classify(struct sched_group *group,
8589
+group_type group_classify(unsigned int imbalance_pct,
8590
+ struct sched_group *group,
89118591 struct sg_lb_stats *sgs)
89128592 {
8913
- if (sgs->group_no_capacity)
8593
+ if (group_is_overloaded(imbalance_pct, sgs))
89148594 return group_overloaded;
89158595
89168596 if (sg_imbalanced(group))
89178597 return group_imbalanced;
89188598
8599
+ if (sgs->group_asym_packing)
8600
+ return group_asym_packing;
8601
+
89198602 if (sgs->group_misfit_task_load)
89208603 return group_misfit_task;
89218604
8922
- return group_other;
8605
+ if (!group_has_capacity(imbalance_pct, sgs))
8606
+ return group_fully_busy;
8607
+
8608
+ return group_has_spare;
89238609 }
89248610
89258611 static bool update_nohz_stats(struct rq *rq, bool force)
....@@ -8956,12 +8642,11 @@
89568642 struct sg_lb_stats *sgs,
89578643 int *sg_status)
89588644 {
8959
- int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8960
- int load_idx = get_sd_load_idx(env->sd, env->idle);
8961
- unsigned long load;
8962
- int i, nr_running;
8645
+ int i, nr_running, local_group;
89638646
89648647 memset(sgs, 0, sizeof(*sgs));
8648
+
8649
+ local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
89658650
89668651 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
89678652 struct rq *rq = cpu_rq(i);
....@@ -8969,17 +8654,14 @@
89698654 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
89708655 env->flags |= LBF_NOHZ_AGAIN;
89718656
8972
- /* Bias balancing toward CPUs of our domain: */
8973
- if (local_group)
8974
- load = target_load(i, load_idx);
8975
- else
8976
- load = source_load(i, load_idx);
8977
-
8978
- sgs->group_load += load;
8657
+ sgs->group_load += cpu_load(rq);
89798658 sgs->group_util += cpu_util(i);
8980
- sgs->sum_nr_running += rq->cfs.h_nr_running;
8659
+ sgs->group_runnable += cpu_runnable(rq);
8660
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running;
89818661
89828662 nr_running = rq->nr_running;
8663
+ sgs->sum_nr_running += nr_running;
8664
+
89838665 if (nr_running > 1)
89848666 *sg_status |= SG_OVERLOAD;
89858667
....@@ -8990,13 +8672,19 @@
89908672 sgs->nr_numa_running += rq->nr_numa_running;
89918673 sgs->nr_preferred_running += rq->nr_preferred_running;
89928674 #endif
8993
- sgs->sum_weighted_load += weighted_cpuload(rq);
89948675 /*
89958676 * No need to call idle_cpu() if nr_running is not 0
89968677 */
8997
- if (!nr_running && idle_cpu(i))
8678
+ if (!nr_running && idle_cpu(i)) {
89988679 sgs->idle_cpus++;
8680
+ /* Idle cpu can't have misfit task */
8681
+ continue;
8682
+ }
89998683
8684
+ if (local_group)
8685
+ continue;
8686
+
8687
+ /* Check for a misfit task on the cpu */
90008688 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
90018689 sgs->group_misfit_task_load < rq->misfit_task_load) {
90028690 sgs->group_misfit_task_load = rq->misfit_task_load;
....@@ -9004,17 +8692,24 @@
90048692 }
90058693 }
90068694
9007
- /* Adjust by relative CPU capacity of the group */
9008
- sgs->group_capacity = group->sgc->capacity;
9009
- sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8695
+ /* Check if dst CPU is idle and preferred to this group */
8696
+ if (env->sd->flags & SD_ASYM_PACKING &&
8697
+ env->idle != CPU_NOT_IDLE &&
8698
+ sgs->sum_h_nr_running &&
8699
+ sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
8700
+ sgs->group_asym_packing = 1;
8701
+ }
90108702
9011
- if (sgs->sum_nr_running)
9012
- sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8703
+ sgs->group_capacity = group->sgc->capacity;
90138704
90148705 sgs->group_weight = group->group_weight;
90158706
9016
- sgs->group_no_capacity = group_is_overloaded(env, sgs);
9017
- sgs->group_type = group_classify(group, sgs);
8707
+ sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
8708
+
8709
+ /* Computing avg_load makes sense only when group is overloaded */
8710
+ if (sgs->group_type == group_overloaded)
8711
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8712
+ sgs->group_capacity;
90188713 }
90198714
90208715 /**
....@@ -9037,6 +8732,10 @@
90378732 {
90388733 struct sg_lb_stats *busiest = &sds->busiest_stat;
90398734
8735
+ /* Make sure that there is at least one task to pull */
8736
+ if (!sgs->sum_h_nr_running)
8737
+ return false;
8738
+
90408739 /*
90418740 * Don't try to pull misfit tasks we can't help.
90428741 * We can use max_capacity here as reduction in capacity on some
....@@ -9045,7 +8744,7 @@
90458744 */
90468745 if (sgs->group_type == group_misfit_task &&
90478746 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
9048
- !group_has_capacity(env, &sds->local_stat)))
8747
+ sds->local_stat.group_type != group_has_spare))
90498748 return false;
90508749
90518750 if (sgs->group_type > busiest->group_type)
....@@ -9054,62 +8753,92 @@
90548753 if (sgs->group_type < busiest->group_type)
90558754 return false;
90568755
9057
- if (sgs->avg_load <= busiest->avg_load)
8756
+ /*
8757
+ * The candidate and the current busiest group are the same type of
8758
+ * group. Let's check which one is the busiest according to the type.
8759
+ */
8760
+
8761
+ switch (sgs->group_type) {
8762
+ case group_overloaded:
8763
+ /* Select the overloaded group with highest avg_load. */
8764
+ if (sgs->avg_load <= busiest->avg_load)
8765
+ return false;
8766
+ break;
8767
+
8768
+ case group_imbalanced:
8769
+ /*
8770
+ * Select the 1st imbalanced group as we don't have any way to
8771
+ * choose one more than another.
8772
+ */
90588773 return false;
90598774
9060
- if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
9061
- goto asym_packing;
9062
-
9063
- /*
9064
- * Candidate sg has no more than one task per CPU and
9065
- * has higher per-CPU capacity. Migrating tasks to less
9066
- * capable CPUs may harm throughput. Maximize throughput,
9067
- * power/energy consequences are not considered.
9068
- */
9069
- if (sgs->sum_nr_running <= sgs->group_weight &&
9070
- group_smaller_min_cpu_capacity(sds->local, sg))
9071
- return false;
9072
-
9073
- /*
9074
- * If we have more than one misfit sg go with the biggest misfit.
9075
- */
9076
- if (sgs->group_type == group_misfit_task &&
9077
- sgs->group_misfit_task_load < busiest->group_misfit_task_load)
9078
- return false;
9079
-
9080
-asym_packing:
9081
- /* This is the busiest node in its class. */
9082
- if (!(env->sd->flags & SD_ASYM_PACKING))
9083
- return true;
9084
-
9085
- /* No ASYM_PACKING if target CPU is already busy */
9086
- if (env->idle == CPU_NOT_IDLE)
9087
- return true;
9088
- /*
9089
- * ASYM_PACKING needs to move all the work to the highest
9090
- * prority CPUs in the group, therefore mark all groups
9091
- * of lower priority than ourself as busy.
9092
- */
9093
- if (sgs->sum_nr_running &&
9094
- sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
9095
- if (!sds->busiest)
9096
- return true;
9097
-
8775
+ case group_asym_packing:
90988776 /* Prefer to move from lowest priority CPU's work */
9099
- if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
9100
- sg->asym_prefer_cpu))
9101
- return true;
8777
+ if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8778
+ return false;
8779
+ break;
8780
+
8781
+ case group_misfit_task:
8782
+ /*
8783
+ * If we have more than one misfit sg go with the biggest
8784
+ * misfit.
8785
+ */
8786
+ if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8787
+ return false;
8788
+ break;
8789
+
8790
+ case group_fully_busy:
8791
+ /*
8792
+ * Select the fully busy group with highest avg_load. In
8793
+ * theory, there is no need to pull task from such kind of
8794
+ * group because tasks have all compute capacity that they need
8795
+ * but we can still improve the overall throughput by reducing
8796
+ * contention when accessing shared HW resources.
8797
+ *
8798
+ * XXX for now avg_load is not computed and always 0 so we
8799
+ * select the 1st one.
8800
+ */
8801
+ if (sgs->avg_load <= busiest->avg_load)
8802
+ return false;
8803
+ break;
8804
+
8805
+ case group_has_spare:
8806
+ /*
8807
+ * Select not overloaded group with lowest number of idle cpus
8808
+ * and highest number of running tasks. We could also compare
8809
+ * the spare capacity which is more stable but it can end up
8810
+ * that the group has less spare capacity but finally more idle
8811
+ * CPUs which means less opportunity to pull tasks.
8812
+ */
8813
+ if (sgs->idle_cpus > busiest->idle_cpus)
8814
+ return false;
8815
+ else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8816
+ (sgs->sum_nr_running <= busiest->sum_nr_running))
8817
+ return false;
8818
+
8819
+ break;
91028820 }
91038821
9104
- return false;
8822
+ /*
8823
+ * Candidate sg has no more than one task per CPU and has higher
8824
+ * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8825
+ * throughput. Maximize throughput, power/energy consequences are not
8826
+ * considered.
8827
+ */
8828
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8829
+ (sgs->group_type <= group_fully_busy) &&
8830
+ (group_smaller_min_cpu_capacity(sds->local, sg)))
8831
+ return false;
8832
+
8833
+ return true;
91058834 }
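
For the group_has_spare case above, the tie-break can be read as a two-key comparison: fewer idle CPUs wins, then more running tasks. A standalone sketch of that ordering, with names simplified and not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

struct spare_stats {
        unsigned int idle_cpus;
        unsigned int sum_nr_running;
};

/* true if candidate "a" should replace "b" as the busiest has-spare group */
static bool busier_has_spare(const struct spare_stats *a,
                             const struct spare_stats *b)
{
        if (a->idle_cpus != b->idle_cpus)
                return a->idle_cpus < b->idle_cpus;
        return a->sum_nr_running > b->sum_nr_running;
}

int main(void)
{
        struct spare_stats cand = { .idle_cpus = 1, .sum_nr_running = 3 };
        struct spare_stats cur  = { .idle_cpus = 2, .sum_nr_running = 5 };

        printf("%d\n", busier_has_spare(&cand, &cur)); /* 1: fewer idle CPUs */
        return 0;
}
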
91068835
91078836 #ifdef CONFIG_NUMA_BALANCING
91088837 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
91098838 {
9110
- if (sgs->sum_nr_running > sgs->nr_numa_running)
8839
+ if (sgs->sum_h_nr_running > sgs->nr_numa_running)
91118840 return regular;
9112
- if (sgs->sum_nr_running > sgs->nr_preferred_running)
8841
+ if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
91138842 return remote;
91148843 return all;
91158844 }
....@@ -9134,18 +8863,334 @@
91348863 }
91358864 #endif /* CONFIG_NUMA_BALANCING */
91368865
8866
+
8867
+struct sg_lb_stats;
8868
+
8869
+/*
8870
+ * task_running_on_cpu - return 1 if @p is running on @cpu.
8871
+ */
8872
+
8873
+static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8874
+{
8875
+ /* Task has no contribution or is new */
8876
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8877
+ return 0;
8878
+
8879
+ if (task_on_rq_queued(p))
8880
+ return 1;
8881
+
8882
+ return 0;
8883
+}
8884
+
8885
+/**
8886
+ * idle_cpu_without - would a given CPU be idle without p ?
8887
+ * @cpu: the processor on which idleness is tested.
8888
+ * @p: task which should be ignored.
8889
+ *
8890
+ * Return: 1 if the CPU would be idle. 0 otherwise.
8891
+ */
8892
+static int idle_cpu_without(int cpu, struct task_struct *p)
8893
+{
8894
+ struct rq *rq = cpu_rq(cpu);
8895
+
8896
+ if (rq->curr != rq->idle && rq->curr != p)
8897
+ return 0;
8898
+
8899
+ /*
8900
+ * rq->nr_running can't be used but an updated version without the
8901
+ * impact of p on cpu must be used instead. The updated nr_running
8902
+ * must be computed and tested before calling idle_cpu_without().
8903
+ */
8904
+
8905
+#ifdef CONFIG_SMP
8906
+ if (rq->ttwu_pending)
8907
+ return 0;
8908
+#endif
8909
+
8910
+ return 1;
8911
+}
8912
+
8913
+/*
8914
+ * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
8915
+ * @sd: The sched_domain level to look for idlest group.
8916
+ * @group: sched_group whose statistics are to be updated.
8917
+ * @sgs: variable to hold the statistics for this group.
8918
+ * @p: The task for which we look for the idlest group/CPU.
8919
+ */
8920
+static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8921
+ struct sched_group *group,
8922
+ struct sg_lb_stats *sgs,
8923
+ struct task_struct *p)
8924
+{
8925
+ int i, nr_running;
8926
+
8927
+ memset(sgs, 0, sizeof(*sgs));
8928
+
8929
+ for_each_cpu(i, sched_group_span(group)) {
8930
+ struct rq *rq = cpu_rq(i);
8931
+ unsigned int local;
8932
+
8933
+ sgs->group_load += cpu_load_without(rq, p);
8934
+ sgs->group_util += cpu_util_without(i, p);
8935
+ sgs->group_runnable += cpu_runnable_without(rq, p);
8936
+ local = task_running_on_cpu(i, p);
8937
+ sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
8938
+
8939
+ nr_running = rq->nr_running - local;
8940
+ sgs->sum_nr_running += nr_running;
8941
+
8942
+ /*
8943
+ * No need to call idle_cpu_without() if nr_running is not 0
8944
+ */
8945
+ if (!nr_running && idle_cpu_without(i, p))
8946
+ sgs->idle_cpus++;
8947
+
8948
+ }
8949
+
8950
+ /* Check if task fits in the group */
8951
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
8952
+ !task_fits_capacity(p, group->sgc->max_capacity)) {
8953
+ sgs->group_misfit_task_load = 1;
8954
+ }
8955
+
8956
+ sgs->group_capacity = group->sgc->capacity;
8957
+
8958
+ sgs->group_weight = group->group_weight;
8959
+
8960
+ sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8961
+
8962
+ /*
8963
+ * Computing avg_load makes sense only when group is fully busy or
8964
+ * overloaded
8965
+ */
8966
+ if (sgs->group_type == group_fully_busy ||
8967
+ sgs->group_type == group_overloaded)
8968
+ sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8969
+ sgs->group_capacity;
8970
+}
8971
+
8972
+static bool update_pick_idlest(struct sched_group *idlest,
8973
+ struct sg_lb_stats *idlest_sgs,
8974
+ struct sched_group *group,
8975
+ struct sg_lb_stats *sgs)
8976
+{
8977
+ if (sgs->group_type < idlest_sgs->group_type)
8978
+ return true;
8979
+
8980
+ if (sgs->group_type > idlest_sgs->group_type)
8981
+ return false;
8982
+
8983
+ /*
8984
+ * The candidate and the current idlest group are the same type of
8985
+ * group. Let's check which one is the idlest according to the type.
8986
+ */
8987
+
8988
+ switch (sgs->group_type) {
8989
+ case group_overloaded:
8990
+ case group_fully_busy:
8991
+ /* Select the group with lowest avg_load. */
8992
+ if (idlest_sgs->avg_load <= sgs->avg_load)
8993
+ return false;
8994
+ break;
8995
+
8996
+ case group_imbalanced:
8997
+ case group_asym_packing:
8998
+ /* Those types are not used in the slow wakeup path */
8999
+ return false;
9000
+
9001
+ case group_misfit_task:
9002
+ /* Select group with the highest max capacity */
9003
+ if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9004
+ return false;
9005
+ break;
9006
+
9007
+ case group_has_spare:
9008
+ /* Select group with most idle CPUs */
9009
+ if (idlest_sgs->idle_cpus > sgs->idle_cpus)
9010
+ return false;
9011
+
9012
+ /* Select group with lowest group_util */
9013
+ if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9014
+ idlest_sgs->group_util <= sgs->group_util)
9015
+ return false;
9016
+
9017
+ break;
9018
+ }
9019
+
9020
+ return true;
9021
+}
9022
+
9023
+/*
9024
+ * find_idlest_group() finds and returns the least busy CPU group within the
9025
+ * domain.
9026
+ *
9027
+ * Assumes p is allowed on at least one CPU in sd.
9028
+ */
9029
+static struct sched_group *
9030
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
9031
+{
9032
+ struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9033
+ struct sg_lb_stats local_sgs, tmp_sgs;
9034
+ struct sg_lb_stats *sgs;
9035
+ unsigned long imbalance;
9036
+ struct sg_lb_stats idlest_sgs = {
9037
+ .avg_load = UINT_MAX,
9038
+ .group_type = group_overloaded,
9039
+ };
9040
+
9041
+ imbalance = scale_load_down(NICE_0_LOAD) *
9042
+ (sd->imbalance_pct-100) / 100;
9043
+
9044
+ do {
9045
+ int local_group;
9046
+
9047
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9048
+ struct root_domain *rd = cpu_rq(this_cpu)->rd;
9049
+ struct cpumask *cpub_mask = rockchip_perf_get_cpub_mask();
9050
+ int level = rockchip_perf_get_level();
9051
+
9052
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9053
+ cpub_mask && cpumask_intersects(p->cpus_ptr, cpub_mask) &&
9054
+ !cpumask_intersects(sched_group_span(group), cpub_mask))
9055
+ continue;
9056
+ }
9057
+
9058
+ /* Skip over this group if it has no CPUs allowed */
9059
+ if (!cpumask_intersects(sched_group_span(group),
9060
+ p->cpus_ptr))
9061
+ continue;
9062
+
9063
+ local_group = cpumask_test_cpu(this_cpu,
9064
+ sched_group_span(group));
9065
+
9066
+ if (local_group) {
9067
+ sgs = &local_sgs;
9068
+ local = group;
9069
+ } else {
9070
+ sgs = &tmp_sgs;
9071
+ }
9072
+
9073
+ update_sg_wakeup_stats(sd, group, sgs, p);
9074
+
9075
+ if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9076
+ idlest = group;
9077
+ idlest_sgs = *sgs;
9078
+ }
9079
+
9080
+ } while (group = group->next, group != sd->groups);
9081
+
9082
+
9083
+ /* There is no idlest group to push tasks to */
9084
+ if (!idlest)
9085
+ return NULL;
9086
+
9087
+ /* The local group has been skipped because of CPU affinity */
9088
+ if (!local)
9089
+ return idlest;
9090
+
9091
+ /*
9092
+ * If the local group is idler than the selected idlest group
9093
+ * don't try and push the task.
9094
+ */
9095
+ if (local_sgs.group_type < idlest_sgs.group_type)
9096
+ return NULL;
9097
+
9098
+ /*
9099
+ * If the local group is busier than the selected idlest group
9100
+ * try and push the task.
9101
+ */
9102
+ if (local_sgs.group_type > idlest_sgs.group_type)
9103
+ return idlest;
9104
+
9105
+ switch (local_sgs.group_type) {
9106
+ case group_overloaded:
9107
+ case group_fully_busy:
9108
+ /*
9109
+ * When comparing groups across NUMA domains, it's possible for
9110
+ * the local domain to be very lightly loaded relative to the
9111
+ * remote domains but "imbalance" skews the comparison making
9112
+ * remote CPUs look much more favourable. When considering
9113
+ * cross-domain, add imbalance to the load on the remote node
9114
+ * and consider staying local.
9115
+ */
9116
+
9117
+ if ((sd->flags & SD_NUMA) &&
9118
+ ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9119
+ return NULL;
9120
+
9121
+ /*
9122
+ * If the local group is less loaded than the selected
9123
+ * idlest group don't try and push any tasks.
9124
+ */
9125
+ if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9126
+ return NULL;
9127
+
9128
+ if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9129
+ return NULL;
9130
+ break;
9131
+
9132
+ case group_imbalanced:
9133
+ case group_asym_packing:
9134
+ /* Those types are not used in the slow wakeup path */
9135
+ return NULL;
9136
+
9137
+ case group_misfit_task:
9138
+ /* Select group with the highest max capacity */
9139
+ if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9140
+ return NULL;
9141
+ break;
9142
+
9143
+ case group_has_spare:
9144
+ if (sd->flags & SD_NUMA) {
9145
+#ifdef CONFIG_NUMA_BALANCING
9146
+ int idlest_cpu;
9147
+ /*
9148
+ * If there is spare capacity at NUMA, try to select
9149
+ * the preferred node
9150
+ */
9151
+ if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9152
+ return NULL;
9153
+
9154
+ idlest_cpu = cpumask_first(sched_group_span(idlest));
9155
+ if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9156
+ return idlest;
9157
+#endif
9158
+ /*
9159
+ * Otherwise, keep the task on this node to stay close
9160
+ * to its wakeup source and improve locality. If there is
9161
+ * a real need of migration, periodic load balance will
9162
+ * take care of it.
9163
+ */
9164
+ if (local_sgs.idle_cpus)
9165
+ return NULL;
9166
+ }
9167
+
9168
+ /*
9169
+ * Select group with highest number of idle CPUs. We could also
9170
+ * compare the utilization which is more stable but it can end
9171
+ * up that the group has less spare capacity but finally more
9172
+ * idle CPUs which means more opportunity to run task.
9173
+ */
9174
+ if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9175
+ return NULL;
9176
+ break;
9177
+ }
9178
+
9179
+ return idlest;
9180
+}
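
A worked example of the slack used in the overloaded/fully_busy comparison above: with scale_load_down(NICE_0_LOAD) at 1024 and a typical imbalance_pct of 117, the allowance comes to 1024 * 17 / 100 = 174, so a remote group must be at least that much lighter in avg_load before the task is pushed away. The numbers below are invented:

#include <stdio.h>

int main(void)
{
        unsigned long nice_0_load = 1024;       /* scale_load_down(NICE_0_LOAD) */
        unsigned int imbalance_pct = 117;       /* typical default */
        unsigned long imbalance = nice_0_load * (imbalance_pct - 100) / 100;
        unsigned long local_avg = 600, idlest_avg = 450;        /* invented */

        printf("allowance = %lu\n", imbalance);                 /* 174 */
        /* same shape as the test above: stay local unless idlest + slack < local */
        printf("%s\n", idlest_avg + imbalance >= local_avg ?
               "stay local" : "push to idlest");                /* stay local */
        return 0;
}
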
9181
+
91379182 /**
91389183 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
91399184 * @env: The load balancing environment.
91409185 * @sds: variable to hold the statistics for this sched_domain.
91419186 */
9187
+
91429188 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
91439189 {
91449190 struct sched_domain *child = env->sd->child;
91459191 struct sched_group *sg = env->sd->groups;
91469192 struct sg_lb_stats *local = &sds->local_stat;
91479193 struct sg_lb_stats tmp_sgs;
9148
- bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
91499194 int sg_status = 0;
91509195
91519196 #ifdef CONFIG_NO_HZ_COMMON
....@@ -9172,22 +9217,6 @@
91729217 if (local_group)
91739218 goto next_group;
91749219
9175
- /*
9176
- * In case the child domain prefers tasks go to siblings
9177
- * first, lower the sg capacity so that we'll try
9178
- * and move all the excess tasks away. We lower the capacity
9179
- * of a group only if the local group has the capacity to fit
9180
- * these excess tasks. The extra check prevents the case where
9181
- * you always pull from the heaviest group when it is already
9182
- * under-utilized (possible with a large weight task outweighs
9183
- * the tasks on the system).
9184
- */
9185
- if (prefer_sibling && sds->local &&
9186
- group_has_capacity(env, local) &&
9187
- (sgs->sum_nr_running > local->sum_nr_running + 1)) {
9188
- sgs->group_no_capacity = 1;
9189
- sgs->group_type = group_classify(sg, sgs);
9190
- }
91919220
91929221 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
91939222 sds->busiest = sg;
....@@ -9196,12 +9225,14 @@
91969225
91979226 next_group:
91989227 /* Now, start updating sd_lb_stats */
9199
- sds->total_running += sgs->sum_nr_running;
92009228 sds->total_load += sgs->group_load;
92019229 sds->total_capacity += sgs->group_capacity;
92029230
92039231 sg = sg->next;
92049232 } while (sg != env->sd->groups);
9233
+
9234
+ /* Tag domain that child domain prefers tasks go to siblings first */
9235
+ sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
92059236
92069237 #ifdef CONFIG_NO_HZ_COMMON
92079238 if ((env->flags & LBF_NOHZ_AGAIN) &&
....@@ -9215,8 +9246,6 @@
92159246 if (env->sd->flags & SD_NUMA)
92169247 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
92179248
9218
- env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
9219
-
92209249 if (!env->sd->parent) {
92219250 struct root_domain *rd = env->dst_rq->rd;
92229251
....@@ -9225,144 +9254,28 @@
92259254
92269255 /* Update over-utilization (tipping point, U >= 0) indicator */
92279256 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
9228
- trace_sched_overutilized(!!(sg_status & SG_OVERUTILIZED));
9257
+ trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
92299258 } else if (sg_status & SG_OVERUTILIZED) {
9230
- WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
9231
- trace_sched_overutilized(1);
9232
- }
9259
+ struct root_domain *rd = env->dst_rq->rd;
92339260
9261
+ WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9262
+ trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
9263
+ }
92349264 }
92359265
9236
-/**
9237
- * check_asym_packing - Check to see if the group is packed into the
9238
- * sched domain.
9239
- *
9240
- * This is primarily intended to used at the sibling level. Some
9241
- * cores like POWER7 prefer to use lower numbered SMT threads. In the
9242
- * case of POWER7, it can move to lower SMT modes only when higher
9243
- * threads are idle. When in lower SMT modes, the threads will
9244
- * perform better since they share less core resources. Hence when we
9245
- * have idle threads, we want them to be the higher ones.
9246
- *
9247
- * This packing function is run on idle threads. It checks to see if
9248
- * the busiest CPU in this domain (core in the P7 case) has a higher
9249
- * CPU number than the packing function is being run on. Here we are
9250
- * assuming lower CPU number will be equivalent to lower a SMT thread
9251
- * number.
9252
- *
9253
- * Return: 1 when packing is required and a task should be moved to
9254
- * this CPU. The amount of the imbalance is returned in env->imbalance.
9255
- *
9256
- * @env: The load balancing environment.
9257
- * @sds: Statistics of the sched_domain which is to be packed
9258
- */
9259
-static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
9266
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
92609267 {
9261
- int busiest_cpu;
9262
-
9263
- if (!(env->sd->flags & SD_ASYM_PACKING))
9264
- return 0;
9265
-
9266
- if (env->idle == CPU_NOT_IDLE)
9267
- return 0;
9268
-
9269
- if (!sds->busiest)
9270
- return 0;
9271
-
9272
- busiest_cpu = sds->busiest->asym_prefer_cpu;
9273
- if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
9274
- return 0;
9275
-
9276
- env->imbalance = DIV_ROUND_CLOSEST(
9277
- sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
9278
- SCHED_CAPACITY_SCALE);
9279
-
9280
- return 1;
9281
-}
9282
-
9283
-/**
9284
- * fix_small_imbalance - Calculate the minor imbalance that exists
9285
- * amongst the groups of a sched_domain, during
9286
- * load balancing.
9287
- * @env: The load balancing environment.
9288
- * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
9289
- */
9290
-static inline
9291
-void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
9292
-{
9293
- unsigned long tmp, capa_now = 0, capa_move = 0;
9294
- unsigned int imbn = 2;
9295
- unsigned long scaled_busy_load_per_task;
9296
- struct sg_lb_stats *local, *busiest;
9297
-
9298
- local = &sds->local_stat;
9299
- busiest = &sds->busiest_stat;
9300
-
9301
- if (!local->sum_nr_running)
9302
- local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
9303
- else if (busiest->load_per_task > local->load_per_task)
9304
- imbn = 1;
9305
-
9306
- scaled_busy_load_per_task =
9307
- (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9308
- busiest->group_capacity;
9309
-
9310
- if (busiest->avg_load + scaled_busy_load_per_task >=
9311
- local->avg_load + (scaled_busy_load_per_task * imbn)) {
9312
- env->imbalance = busiest->load_per_task;
9313
- return;
9314
- }
9268
+ unsigned int imbalance_min;
93159269
93169270 /*
9317
- * OK, we don't have enough imbalance to justify moving tasks,
9318
- * however we may be able to increase total CPU capacity used by
9319
- * moving them.
9271
+ * Allow a small imbalance based on a simple pair of communicating
9272
+ * tasks that remain local when the source domain is almost idle.
93209273 */
9274
+ imbalance_min = 2;
9275
+ if (nr_running <= imbalance_min)
9276
+ return 0;
93219277
9322
- capa_now += busiest->group_capacity *
9323
- min(busiest->load_per_task, busiest->avg_load);
9324
- capa_now += local->group_capacity *
9325
- min(local->load_per_task, local->avg_load);
9326
- capa_now /= SCHED_CAPACITY_SCALE;
9327
-
9328
- /* Amount of load we'd subtract */
9329
- if (busiest->avg_load > scaled_busy_load_per_task) {
9330
- capa_move += busiest->group_capacity *
9331
- min(busiest->load_per_task,
9332
- busiest->avg_load - scaled_busy_load_per_task);
9333
- }
9334
-
9335
- /* Amount of load we'd add */
9336
- if (busiest->avg_load * busiest->group_capacity <
9337
- busiest->load_per_task * SCHED_CAPACITY_SCALE) {
9338
- tmp = (busiest->avg_load * busiest->group_capacity) /
9339
- local->group_capacity;
9340
- } else {
9341
- tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
9342
- local->group_capacity;
9343
- }
9344
- capa_move += local->group_capacity *
9345
- min(local->load_per_task, local->avg_load + tmp);
9346
- capa_move /= SCHED_CAPACITY_SCALE;
9347
-
9348
- /* Move if we gain throughput */
9349
- if (capa_move > capa_now) {
9350
- env->imbalance = busiest->load_per_task;
9351
- return;
9352
- }
9353
-
9354
- /* We can't see throughput improvement with the load-based
9355
- * method, but it is possible depending upon group size and
9356
- * capacity range that there might still be an underutilized
9357
- * cpu available in an asymmetric capacity system. Do one last
9358
- * check just in case.
9359
- */
9360
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
9361
- busiest->group_type == group_overloaded &&
9362
- busiest->sum_nr_running > busiest->group_weight &&
9363
- local->sum_nr_running < local->group_weight &&
9364
- local->group_capacity < busiest->group_capacity)
9365
- env->imbalance = busiest->load_per_task;
9278
+ return imbalance;
93669279 }
93679280
93689281 /**
....@@ -9373,96 +9286,169 @@
93739286 */
93749287 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
93759288 {
9376
- unsigned long max_pull, load_above_capacity = ~0UL;
93779289 struct sg_lb_stats *local, *busiest;
93789290
93799291 local = &sds->local_stat;
93809292 busiest = &sds->busiest_stat;
93819293
9294
+ if (busiest->group_type == group_misfit_task) {
9295
+ /* Set imbalance to allow misfit tasks to be balanced. */
9296
+ env->migration_type = migrate_misfit;
9297
+ env->imbalance = 1;
9298
+ return;
9299
+ }
9300
+
9301
+ if (busiest->group_type == group_asym_packing) {
9302
+ /*
9303
+ * In case of asym capacity, we will try to migrate all load to
9304
+ * the preferred CPU.
9305
+ */
9306
+ env->migration_type = migrate_task;
9307
+ env->imbalance = busiest->sum_h_nr_running;
9308
+ return;
9309
+ }
9310
+
93829311 if (busiest->group_type == group_imbalanced) {
93839312 /*
93849313 * In the group_imb case we cannot rely on group-wide averages
9385
- * to ensure CPU-load equilibrium, look at wider averages. XXX
9314
+ * to ensure CPU-load equilibrium, try to move any task to fix
9315
+ * the imbalance. The next load balance will take care of
9316
+ * balancing back the system.
93869317 */
9387
- busiest->load_per_task =
9388
- min(busiest->load_per_task, sds->avg_load);
9318
+ env->migration_type = migrate_task;
9319
+ env->imbalance = 1;
9320
+ return;
93899321 }
93909322
93919323 /*
9392
- * Avg load of busiest sg can be less and avg load of local sg can
9393
- * be greater than avg load across all sgs of sd because avg load
9394
- * factors in sg capacity and sgs with smaller group_type are
9395
- * skipped when updating the busiest sg:
9324
+ * Try to use spare capacity of local group without overloading it or
9325
+ * emptying busiest.
93969326 */
9397
- if (busiest->group_type != group_misfit_task &&
9398
- (busiest->avg_load <= sds->avg_load ||
9399
- local->avg_load >= sds->avg_load)) {
9400
- env->imbalance = 0;
9401
- return fix_small_imbalance(env, sds);
9327
+ if (local->group_type == group_has_spare) {
9328
+ if ((busiest->group_type > group_fully_busy) &&
9329
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
9330
+ /*
9331
+ * If busiest is overloaded, try to fill spare
9332
+ * capacity. This might end up creating spare capacity
9333
+ * in busiest or busiest still being overloaded but
9334
+ * there is no simple way to directly compute the
9335
+ * amount of load to migrate in order to balance the
9336
+ * system.
9337
+ */
9338
+ env->migration_type = migrate_util;
9339
+ env->imbalance = max(local->group_capacity, local->group_util) -
9340
+ local->group_util;
9341
+
9342
+ /*
9343
+ * In some cases, the group's utilization is max or even
9344
+ * higher than capacity because of migrations but the
9345
+ * local CPU is (newly) idle. There is at least one
9346
+ * waiting task in this overloaded busiest group. Let's
9347
+ * try to pull it.
9348
+ */
9349
+ if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9350
+ env->migration_type = migrate_task;
9351
+ env->imbalance = 1;
9352
+ }
9353
+
9354
+ return;
9355
+ }
9356
+
9357
+ if (busiest->group_weight == 1 || sds->prefer_sibling) {
9358
+ unsigned int nr_diff = busiest->sum_nr_running;
9359
+ /*
9360
+ * When prefer sibling, evenly spread running tasks on
9361
+ * groups.
9362
+ */
9363
+ env->migration_type = migrate_task;
9364
+ lsub_positive(&nr_diff, local->sum_nr_running);
9365
+ env->imbalance = nr_diff >> 1;
9366
+ } else {
9367
+
9368
+ /*
9369
+ * If there is no overload, we just want to even the number of
9370
+ * idle cpus.
9371
+ */
9372
+ env->migration_type = migrate_task;
9373
+ env->imbalance = max_t(long, 0, (local->idle_cpus -
9374
+ busiest->idle_cpus) >> 1);
9375
+ }
9376
+
9377
+ /* Consider allowing a small imbalance between NUMA groups */
9378
+ if (env->sd->flags & SD_NUMA)
9379
+ env->imbalance = adjust_numa_imbalance(env->imbalance,
9380
+ busiest->sum_nr_running);
9381
+
9382
+ return;
94029383 }
94039384
94049385 /*
9405
- * If there aren't any idle CPUs, avoid creating some.
9386
+ * Local is fully busy but has to take more load to relieve the
9387
+ * busiest group
94069388 */
9407
- if (busiest->group_type == group_overloaded &&
9408
- local->group_type == group_overloaded) {
9409
- load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
9410
- if (load_above_capacity > busiest->group_capacity) {
9411
- load_above_capacity -= busiest->group_capacity;
9412
- load_above_capacity *= scale_load_down(NICE_0_LOAD);
9413
- load_above_capacity /= busiest->group_capacity;
9414
- } else
9415
- load_above_capacity = ~0UL;
9389
+ if (local->group_type < group_overloaded) {
9390
+ /*
9391
+ * Local will become overloaded so the avg_load metrics are
9392
+ * finally needed.
9393
+ */
9394
+
9395
+ local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9396
+ local->group_capacity;
9397
+
9398
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9399
+ sds->total_capacity;
9400
+ /*
9401
+ * If the local group is more loaded than the selected
9402
+ * busiest group don't try to pull any tasks.
9403
+ */
9404
+ if (local->avg_load >= busiest->avg_load) {
9405
+ env->imbalance = 0;
9406
+ return;
9407
+ }
94169408 }
94179409
94189410 /*
9419
- * We're trying to get all the CPUs to the average_load, so we don't
9420
- * want to push ourselves above the average load, nor do we wish to
9421
- * reduce the max loaded CPU below the average load. At the same time,
9422
- * we also don't want to reduce the group load below the group
9423
- * capacity. Thus we look for the minimum possible imbalance.
9411
+ * Both groups are or will become overloaded and we're trying to get all
9412
+ * the CPUs to the average_load, so we don't want to push ourselves
9413
+ * above the average load, nor do we wish to reduce the max loaded CPU
9414
+ * below the average load. At the same time, we also don't want to
9415
+ * reduce the group load below the group capacity. Thus we look for
9416
+ * the minimum possible imbalance.
94249417 */
9425
- max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
9426
-
9427
- /* How much load to actually move to equalise the imbalance */
9418
+ env->migration_type = migrate_load;
94289419 env->imbalance = min(
9429
- max_pull * busiest->group_capacity,
9420
+ (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
94309421 (sds->avg_load - local->avg_load) * local->group_capacity
94319422 ) / SCHED_CAPACITY_SCALE;
9432
-
9433
- /* Boost imbalance to allow misfit task to be balanced.
9434
- * Always do this if we are doing a NEWLY_IDLE balance
9435
- * on the assumption that any tasks we have must not be
9436
- * long-running (and hence we cannot rely upon load).
9437
- * However if we are not idle, we should assume the tasks
9438
- * we have are longer running and not override load-based
9439
- * calculations above unless we are sure that the local
9440
- * group is underutilized.
9441
- */
9442
- if (busiest->group_type == group_misfit_task &&
9443
- (env->idle == CPU_NEWLY_IDLE ||
9444
- local->sum_nr_running < local->group_weight)) {
9445
- env->imbalance = max_t(long, env->imbalance,
9446
- busiest->group_misfit_task_load);
9447
- }
9448
-
9449
- /*
9450
- * if *imbalance is less than the average load per runnable task
9451
- * there is no guarantee that any tasks will be moved so we'll have
9452
- * a think about bumping its value to force at least one task to be
9453
- * moved
9454
- */
9455
- if (env->imbalance < busiest->load_per_task)
9456
- return fix_small_imbalance(env, sds);
94579423 }
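
As a worked example of the migrate_task branches above (task and idle-CPU counts invented): moving half of the difference converges over repeated balance rounds instead of ping-ponging tasks between the two groups.

#include <stdio.h>

int main(void)
{
        unsigned int busiest_nr = 7, local_nr = 1;
        unsigned int nr_diff = busiest_nr - local_nr;   /* as lsub_positive() would compute */
        unsigned int local_idle = 4, busiest_idle = 0;

        printf("imbalance = %u task(s)\n", nr_diff >> 1);                     /* 3 */
        /* the no-overload variant evens out idle CPUs the same way */
        printf("imbalance = %u task(s)\n", (local_idle - busiest_idle) >> 1); /* 2 */
        return 0;
}
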
94589424
94599425 /******* find_busiest_group() helpers end here *********************/
9426
+
9427
+/*
9428
+ * Decision matrix according to the local and busiest group type:
9429
+ *
9430
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9431
+ * has_spare nr_idle balanced N/A N/A balanced balanced
9432
+ * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9433
+ * misfit_task force N/A N/A N/A force force
9434
+ * asym_packing force force N/A N/A force force
9435
+ * imbalanced force force N/A N/A force force
9436
+ * overloaded force force N/A N/A force avg_load
9437
+ *
9438
+ * N/A : Not Applicable because already filtered while updating
9439
+ * statistics.
9440
+ * balanced : The system is balanced for these 2 groups.
9441
+ * force : Calculate the imbalance as load migration is probably needed.
9442
+ * avg_load : Only if imbalance is significant enough.
9443
+ * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9444
+ * different between the groups.
9445
+ */
94609446
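The matrix above is documentation only; the decisions themselves are spread across find_busiest_group() below. Read as data, though, it is simply a lookup from (busiest group type, local group type) to an action. A hypothetical transcription, useful for checking a given pair at a glance (the enum names are shortened for the example and are not the kernel's group_type values):

#include <stdio.h>

/* Group types in increasing order of "busyness", as in the table above. */
enum grp_type { has_spare, fully_busy, misfit, asym, imbalanced, overloaded, NR_TYPES };

enum lb_decision { BALANCED, NR_IDLE, FORCE, AVG_LOAD, NA };

/* decision[busiest][local], transcribed row by row from the comment. */
static const enum lb_decision decision[NR_TYPES][NR_TYPES] = {
	/* has_spare  */ { NR_IDLE, BALANCED, NA, NA, BALANCED, BALANCED },
	/* fully_busy */ { NR_IDLE, NR_IDLE,  NA, NA, BALANCED, BALANCED },
	/* misfit     */ { FORCE,   NA,       NA, NA, FORCE,    FORCE    },
	/* asym       */ { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	/* imbalanced */ { FORCE,   FORCE,    NA, NA, FORCE,    FORCE    },
	/* overloaded */ { FORCE,   FORCE,    NA, NA, FORCE,    AVG_LOAD },
};

int main(void)
{
	static const char *names[] = { "balanced", "nr_idle", "force", "avg_load", "N/A" };

	/* e.g. busiest group overloaded while the local group has spare capacity */
	printf("%s\n", names[decision[overloaded][has_spare]]); /* "force" */
	return 0;
}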
94619447 /**
94629448 * find_busiest_group - Returns the busiest group within the sched_domain
94639449 * if there is an imbalance.
94649450 *
9465
- * Also calculates the amount of weighted load which should be moved
9451
+ * Also calculates the amount of runnable load which should be moved
94669452 * to restore balance.
94679453 *
94689454 * @env: The load balancing environment.
....@@ -9477,91 +9463,120 @@
94779463 init_sd_lb_stats(&sds);
94789464
94799465 /*
9480
- * Compute the various statistics relavent for load balancing at
9466
+ * Compute the various statistics relevant for load balancing at
94819467 * this level.
94829468 */
94839469 update_sd_lb_stats(env, &sds);
94849470
9485
- if (static_branch_unlikely(&sched_energy_present)) {
9471
+ if (sched_energy_enabled()) {
94869472 struct root_domain *rd = env->dst_rq->rd;
9473
+ int out_balance = 1;
94879474
9488
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9475
+ trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq,
9476
+ &out_balance);
9477
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)
9478
+ && out_balance)
94899479 goto out_balanced;
94909480 }
94919481
94929482 local = &sds.local_stat;
94939483 busiest = &sds.busiest_stat;
94949484
9495
- /* ASYM feature bypasses nice load balance check */
9496
- if (check_asym_packing(env, &sds))
9497
- return sds.busiest;
9498
-
94999485 /* There is no busy sibling group to pull tasks from */
9500
- if (!sds.busiest || busiest->sum_nr_running == 0)
9486
+ if (!sds.busiest)
95019487 goto out_balanced;
95029488
9503
- /* XXX broken for overlapping NUMA groups */
9504
- sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
9505
- / sds.total_capacity;
9489
+ /* Misfit tasks should be dealt with regardless of the avg load */
9490
+ if (busiest->group_type == group_misfit_task)
9491
+ goto force_balance;
9492
+
9493
+ /* ASYM feature bypasses nice load balance check */
9494
+ if (busiest->group_type == group_asym_packing)
9495
+ goto force_balance;
95069496
95079497 /*
95089498 * If the busiest group is imbalanced the below checks don't
95099499 * work because they assume all things are equal, which typically
9510
- * isn't true due to cpus_allowed constraints and the like.
9500
+ * isn't true due to cpus_ptr constraints and the like.
95119501 */
95129502 if (busiest->group_type == group_imbalanced)
9513
- goto force_balance;
9514
-
9515
- /*
9516
- * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
9517
- * capacities from resulting in underutilization due to avg_load.
9518
- */
9519
- if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
9520
- busiest->group_no_capacity)
9521
- goto force_balance;
9522
-
9523
- /* Misfit tasks should be dealt with regardless of the avg load */
9524
- if (busiest->group_type == group_misfit_task)
95259503 goto force_balance;
95269504
95279505 /*
95289506 * If the local group is busier than the selected busiest group
95299507 * don't try and pull any tasks.
95309508 */
9531
- if (local->avg_load >= busiest->avg_load)
9509
+ if (local->group_type > busiest->group_type)
95329510 goto out_balanced;
95339511
95349512 /*
9535
- * Don't pull any tasks if this group is already above the domain
9536
- * average load.
9513
+ * When groups are overloaded, use the avg_load to ensure fairness
9514
+ * between tasks.
95379515 */
9538
- if (local->avg_load >= sds.avg_load)
9539
- goto out_balanced;
9540
-
9541
- if (env->idle == CPU_IDLE) {
9516
+ if (local->group_type == group_overloaded) {
95429517 /*
9543
- * This CPU is idle. If the busiest group is not overloaded
9544
- * and there is no imbalance between this and busiest group
9545
- * wrt idle CPUs, it is balanced. The imbalance becomes
9546
- * significant if the diff is greater than 1 otherwise we
9547
- * might end up to just move the imbalance on another group
9518
+ * If the local group is more loaded than the selected
9519
+ * busiest group don't try to pull any tasks.
95489520 */
9549
- if ((busiest->group_type != group_overloaded) &&
9550
- (local->idle_cpus <= (busiest->idle_cpus + 1)))
9521
+ if (local->avg_load >= busiest->avg_load)
95519522 goto out_balanced;
9552
- } else {
9523
+
9524
+ /* XXX broken for overlapping NUMA groups */
9525
+ sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9526
+ sds.total_capacity;
9527
+
95539528 /*
9554
- * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
9555
- * imbalance_pct to be conservative.
9529
+ * Don't pull any tasks if this group is already above the
9530
+ * domain average load.
9531
+ */
9532
+ if (local->avg_load >= sds.avg_load)
9533
+ goto out_balanced;
9534
+
9535
+ /*
9536
+ * If the busiest group is more loaded, use imbalance_pct to be
9537
+ * conservative.
95569538 */
95579539 if (100 * busiest->avg_load <=
95589540 env->sd->imbalance_pct * local->avg_load)
95599541 goto out_balanced;
95609542 }
95619543
9544
+ /* Try to move all excess tasks to child's sibling domain */
9545
+ if (sds.prefer_sibling && local->group_type == group_has_spare &&
9546
+ busiest->sum_nr_running > local->sum_nr_running + 1)
9547
+ goto force_balance;
9548
+
9549
+ if (busiest->group_type != group_overloaded) {
9550
+ if (env->idle == CPU_NOT_IDLE)
9551
+ /*
9552
+ * If the busiest group is not overloaded (and as a
9553
+ * result the local one too) but this CPU is already
9554
+ * busy, let another idle CPU try to pull tasks.
9555
+ */
9556
+ goto out_balanced;
9557
+
9558
+ if (busiest->group_weight > 1 &&
9559
+ local->idle_cpus <= (busiest->idle_cpus + 1))
9560
+ /*
9561
+ * If the busiest group is not overloaded
9562
+ * and there is no imbalance between this and busiest
9563
+ * group wrt idle CPUs, it is balanced. The imbalance
9564
+ * becomes significant if the diff is greater than 1
9565
+ * otherwise we might end up just moving the imbalance
9566
+ * to another group. Of course this applies only if
9567
+ * there is more than 1 CPU per group.
9568
+ */
9569
+ goto out_balanced;
9570
+
9571
+ if (busiest->sum_h_nr_running == 1)
9572
+ /*
9573
+ * busiest doesn't have any tasks waiting to run
9574
+ */
9575
+ goto out_balanced;
9576
+ }
9577
+
95629578 force_balance:
95639579 /* Looks like there is an imbalance. Compute it */
9564
- env->src_grp_type = busiest->group_type;
95659580 calculate_imbalance(env, &sds);
95669581 return env->imbalance ? sds.busiest : NULL;
95679582
....@@ -9577,11 +9592,18 @@
95779592 struct sched_group *group)
95789593 {
95799594 struct rq *busiest = NULL, *rq;
9580
- unsigned long busiest_load = 0, busiest_capacity = 1;
9581
- int i;
9595
+ unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9596
+ unsigned int busiest_nr = 0;
9597
+ int i, done = 0;
9598
+
9599
+ trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus,
9600
+ &busiest, &done);
9601
+ if (done)
9602
+ return busiest;
95829603
95839604 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
9584
- unsigned long capacity, wl;
9605
+ unsigned long capacity, load, util;
9606
+ unsigned int nr_running;
95859607 enum fbq_type rt;
95869608
95879609 rq = cpu_rq(i);
....@@ -9609,20 +9631,8 @@
96099631 if (rt > env->fbq_type)
96109632 continue;
96119633
9612
- /*
9613
- * For ASYM_CPUCAPACITY domains with misfit tasks we simply
9614
- * seek the "biggest" misfit task.
9615
- */
9616
- if (env->src_grp_type == group_misfit_task) {
9617
- if (rq->misfit_task_load > busiest_load) {
9618
- busiest_load = rq->misfit_task_load;
9619
- busiest = rq;
9620
- }
9621
-
9622
- continue;
9623
- }
9624
-
96259634 capacity = capacity_of(i);
9635
+ nr_running = rq->cfs.h_nr_running;
96269636
96279637 /*
96289638 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
....@@ -9632,35 +9642,77 @@
96329642 */
96339643 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
96349644 capacity_of(env->dst_cpu) < capacity &&
9635
- rq->nr_running == 1)
9645
+ nr_running == 1)
96369646 continue;
96379647
9638
- wl = weighted_cpuload(rq);
9648
+ switch (env->migration_type) {
9649
+ case migrate_load:
9650
+ /*
9651
+ * When comparing with load imbalance, use cpu_load()
9652
+ * which is not scaled with the CPU capacity.
9653
+ */
9654
+ load = cpu_load(rq);
96399655
9640
- /*
9641
- * When comparing with imbalance, use weighted_cpuload()
9642
- * which is not scaled with the CPU capacity.
9643
- */
9656
+ if (nr_running == 1 && load > env->imbalance &&
9657
+ !check_cpu_capacity(rq, env->sd))
9658
+ break;
96449659
9645
- if (rq->nr_running == 1 && wl > env->imbalance &&
9646
- !check_cpu_capacity(rq, env->sd))
9647
- continue;
9660
+ /*
9661
+ * For the load comparisons with the other CPUs,
9662
+ * consider the cpu_load() scaled with the CPU
9663
+ * capacity, so that the load can be moved away
9664
+ * from the CPU that is potentially running at a
9665
+ * lower capacity.
9666
+ *
9667
+ * Thus we're looking for max(load_i / capacity_i),
9668
+ * crosswise multiplication to rid ourselves of the
9669
+ * division works out to:
9670
+ * load_i * capacity_j > load_j * capacity_i;
9671
+ * where j is our previous maximum.
9672
+ */
9673
+ if (load * busiest_capacity > busiest_load * capacity) {
9674
+ busiest_load = load;
9675
+ busiest_capacity = capacity;
9676
+ busiest = rq;
9677
+ }
9678
+ break;
96489679
9649
- /*
9650
- * For the load comparisons with the other CPU's, consider
9651
- * the weighted_cpuload() scaled with the CPU capacity, so
9652
- * that the load can be moved away from the CPU that is
9653
- * potentially running at a lower capacity.
9654
- *
9655
- * Thus we're looking for max(wl_i / capacity_i), crosswise
9656
- * multiplication to rid ourselves of the division works out
9657
- * to: wl_i * capacity_j > wl_j * capacity_i; where j is
9658
- * our previous maximum.
9659
- */
9660
- if (wl * busiest_capacity > busiest_load * capacity) {
9661
- busiest_load = wl;
9662
- busiest_capacity = capacity;
9663
- busiest = rq;
9680
+ case migrate_util:
9681
+ util = cpu_util(cpu_of(rq));
9682
+
9683
+ /*
9684
+ * Don't try to pull utilization from a CPU with one
9685
+ * running task. Whatever its utilization, we will fail
9686
+ * detach the task.
9687
+ */
9688
+ if (nr_running <= 1)
9689
+ continue;
9690
+
9691
+ if (busiest_util < util) {
9692
+ busiest_util = util;
9693
+ busiest = rq;
9694
+ }
9695
+ break;
9696
+
9697
+ case migrate_task:
9698
+ if (busiest_nr < nr_running) {
9699
+ busiest_nr = nr_running;
9700
+ busiest = rq;
9701
+ }
9702
+ break;
9703
+
9704
+ case migrate_misfit:
9705
+ /*
9706
+ * For ASYM_CPUCAPACITY domains with misfit tasks we
9707
+ * simply seek the "biggest" misfit task.
9708
+ */
9709
+ if (rq->misfit_task_load > busiest_load) {
9710
+ busiest_load = rq->misfit_task_load;
9711
+ busiest = rq;
9712
+ }
9713
+
9714
+ break;
9715
+
96649716 }
96659717 }
96669718
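The migrate_load case keeps the max(load_i / capacity_i) search in integer math by cross-multiplying instead of dividing, exactly as the comment above spells out. A small self-contained sketch of just that comparison loop; the per-CPU numbers are invented for illustration:

#include <stdio.h>

struct cpu_stat {
	unsigned long load;	/* cpu_load(rq) equivalent */
	unsigned long capacity;	/* capacity_of(cpu) equivalent */
};

int main(void)
{
	/* Made-up per-CPU numbers: CPU1 runs at half capacity. */
	struct cpu_stat cpus[] = {
		{ .load = 400, .capacity = 1024 },
		{ .load = 300, .capacity =  512 },
		{ .load = 500, .capacity = 1024 },
	};
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int i, busiest = -1;

	for (i = 0; i < 3; i++) {
		/*
		 * Looking for max(load_i / capacity_i); cross-multiplying
		 * keeps everything in integer arithmetic:
		 *   load_i * capacity_j > load_j * capacity_i
		 */
		if (cpus[i].load * busiest_capacity >
		    busiest_load * cpus[i].capacity) {
			busiest_load = cpus[i].load;
			busiest_capacity = cpus[i].capacity;
			busiest = i;
		}
	}

	printf("busiest CPU: %d\n", busiest); /* CPU 1: 300/512 > 500/1024 */
	return 0;
}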
....@@ -9673,21 +9725,25 @@
96739725 */
96749726 #define MAX_PINNED_INTERVAL 512
96759727
9676
-static int need_active_balance(struct lb_env *env)
9728
+static inline bool
9729
+asym_active_balance(struct lb_env *env)
9730
+{
9731
+ /*
9732
+ * ASYM_PACKING needs to force migrate tasks from busy but
9733
+ * lower priority CPUs in order to pack all tasks in the
9734
+ * highest priority CPUs.
9735
+ */
9736
+ return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9737
+ sched_asym_prefer(env->dst_cpu, env->src_cpu);
9738
+}
9739
+
9740
+static inline bool
9741
+voluntary_active_balance(struct lb_env *env)
96779742 {
96789743 struct sched_domain *sd = env->sd;
96799744
9680
- if (env->idle == CPU_NEWLY_IDLE) {
9681
-
9682
- /*
9683
- * ASYM_PACKING needs to force migrate tasks from busy but
9684
- * lower priority CPUs in order to pack all tasks in the
9685
- * highest priority CPUs.
9686
- */
9687
- if ((sd->flags & SD_ASYM_PACKING) &&
9688
- sched_asym_prefer(env->dst_cpu, env->src_cpu))
9689
- return 1;
9690
- }
9745
+ if (asym_active_balance(env))
9746
+ return 1;
96919747
96929748 /*
96939749 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
....@@ -9702,19 +9758,18 @@
97029758 return 1;
97039759 }
97049760
9705
- if (env->src_grp_type == group_misfit_task)
9761
+ if (env->migration_type == migrate_misfit)
97069762 return 1;
97079763
9708
- if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9709
- env->src_rq->cfs.h_nr_running == 1 &&
9710
- cpu_overutilized(env->src_cpu) &&
9711
- !cpu_overutilized(env->dst_cpu)) {
9712
- return 1;
9713
- }
9764
+ return 0;
9765
+}
97149766
9715
- if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
9716
- return 1;
9767
+static int need_active_balance(struct lb_env *env)
9768
+{
9769
+ struct sched_domain *sd = env->sd;
97179770
9771
+ if (voluntary_active_balance(env))
9772
+ return 1;
97189773
97199774 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
97209775 }
....@@ -9724,7 +9779,17 @@
97249779 static int should_we_balance(struct lb_env *env)
97259780 {
97269781 struct sched_group *sg = env->sd->groups;
9727
- int cpu, balance_cpu = -1;
9782
+ int cpu;
9783
+
9784
+ if (IS_ENABLED(CONFIG_ROCKCHIP_PERFORMANCE)) {
9785
+ struct root_domain *rd = env->dst_rq->rd;
9786
+ struct cpumask *cpul_mask = rockchip_perf_get_cpul_mask();
9787
+ int level = rockchip_perf_get_level();
9788
+
9789
+ if ((level == ROCKCHIP_PERFORMANCE_HIGH) && !READ_ONCE(rd->overutilized) &&
9790
+ cpul_mask && cpumask_test_cpu(env->dst_cpu, cpul_mask))
9791
+ return 0;
9792
+ }
97289793
97299794 /*
97309795 * Ensure the balancing environment is consistent; can happen
....@@ -9745,18 +9810,12 @@
97459810 if (!idle_cpu(cpu))
97469811 continue;
97479812
9748
- balance_cpu = cpu;
9749
- break;
9813
+ /* Are we the first idle CPU? */
9814
+ return cpu == env->dst_cpu;
97509815 }
97519816
9752
- if (balance_cpu == -1)
9753
- balance_cpu = group_balance_cpu(sg);
9754
-
9755
- /*
9756
- * First idle CPU or the first CPU(busiest) in this sched group
9757
- * is eligible for doing load balancing at this and above domains.
9758
- */
9759
- return balance_cpu == env->dst_cpu;
9817
+ /* Are we the first CPU of this group? */
9818
+ return group_balance_cpu(sg) == env->dst_cpu;
97609819 }
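After the rewrite above, eligibility is simply: the first idle CPU in the group may balance, and only if no CPU is idle does the designated group balance CPU get the job. A hedged userspace sketch of just that scan, ignoring the cpumask filtering and the NEWLY_IDLE early return that the kernel also performs:

#include <stdio.h>
#include <stdbool.h>

static bool should_balance(int dst_cpu, const bool *cpu_idle, int nr_cpus,
			   int group_balance_cpu)
{
	int cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!cpu_idle[cpu])
			continue;
		/* Are we the first idle CPU? */
		return cpu == dst_cpu;
	}

	/* Nobody is idle: fall back to the group's balance CPU. */
	return group_balance_cpu == dst_cpu;
}

int main(void)
{
	bool idle[4] = { false, true, true, false };

	printf("%d\n", should_balance(1, idle, 4, 0)); /* 1: CPU1 is the first idle CPU */
	printf("%d\n", should_balance(2, idle, 4, 0)); /* 0: CPU1 takes precedence */
	return 0;
}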
97619820
97629821 /*
....@@ -9828,6 +9887,7 @@
98289887
98299888 more_balance:
98309889 rq_lock_irqsave(busiest, &rf);
9890
+ env.src_rq_rf = &rf;
98319891 update_rq_clock(busiest);
98329892
98339893 /*
....@@ -9880,7 +9940,7 @@
98809940 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
98819941
98829942 /* Prevent to re-select dst_cpu via env's CPUs */
9883
- cpumask_clear_cpu(env.dst_cpu, env.cpus);
9943
+ __cpumask_clear_cpu(env.dst_cpu, env.cpus);
98849944
98859945 env.dst_rq = cpu_rq(env.new_dst_cpu);
98869946 env.dst_cpu = env.new_dst_cpu;
....@@ -9907,7 +9967,7 @@
99079967
99089968 /* All tasks on this runqueue were pinned by CPU affinity */
99099969 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9910
- cpumask_clear_cpu(cpu_of(busiest), cpus);
9970
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
99119971 /*
99129972 * Attempting to continue load balancing at the current
99139973 * sched_domain level only makes sense if there are
....@@ -9934,8 +9994,7 @@
99349994 * excessive cache_hot migrations and active balances.
99359995 */
99369996 if (idle != CPU_NEWLY_IDLE)
9937
- if (env.src_grp_nr_running > 1)
9938
- sd->nr_balance_failed++;
9997
+ sd->nr_balance_failed++;
99399998
99409999 if (need_active_balance(&env)) {
994110000 unsigned long flags;
....@@ -9947,7 +10006,7 @@
994710006 * if the curr task on busiest CPU can't be
994810007 * moved to this_cpu:
994910008 */
9950
- if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
10009
+ if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
995110010 raw_spin_unlock_irqrestore(&busiest->lock,
995210011 flags);
995310012 env.flags |= LBF_ALL_PINNED;
....@@ -9978,7 +10037,7 @@
997810037 } else
997910038 sd->nr_balance_failed = 0;
998010039
9981
- if (likely(!active_balance)) {
10040
+ if (likely(!active_balance) || voluntary_active_balance(&env)) {
998210041 /* We were unbalanced, so reset the balancing interval */
998310042 sd->balance_interval = sd->min_interval;
998410043 } else {
....@@ -10021,18 +10080,18 @@
1002110080 ld_moved = 0;
1002210081
1002310082 /*
10024
- * idle_balance() disregards balance intervals, so we could repeatedly
10025
- * reach this code, which would lead to balance_interval skyrocketting
10026
- * in a short amount of time. Skip the balance_interval increase logic
10027
- * to avoid that.
10083
+ * newidle_balance() disregards balance intervals, so we could
10084
+ * repeatedly reach this code, which would lead to balance_interval
10085
+ * skyrocketting in a short amount of time. Skip the balance_interval
10086
+ * increase logic to avoid that.
1002810087 */
1002910088 if (env.idle == CPU_NEWLY_IDLE)
1003010089 goto out;
1003110090
1003210091 /* tune up the balancing interval */
10033
- if (((env.flags & LBF_ALL_PINNED) &&
10034
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
10035
- (sd->balance_interval < sd->max_interval))
10092
+ if ((env.flags & LBF_ALL_PINNED &&
10093
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
10094
+ sd->balance_interval < sd->max_interval)
1003610095 sd->balance_interval *= 2;
1003710096 out:
1003810097 return ld_moved;
....@@ -10048,6 +10107,15 @@
1004810107
1004910108 /* scale ms to jiffies */
1005010109 interval = msecs_to_jiffies(interval);
10110
+
10111
+ /*
10112
+ * Reduce likelihood of busy balancing at higher domains racing with
10113
+ * balancing at lower domains by preventing their balancing periods
10114
+ * from being multiples of each other.
10115
+ */
10116
+ if (cpu_busy)
10117
+ interval -= 1;
10118
+
1005110119 interval = clamp(interval, 1UL, max_load_balance_interval);
1005210120
1005310121 return interval;
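The one-jiffy offset keeps the busy-balance period of a domain from being an exact multiple of its child's period, so nested domains are less likely to keep balancing on the same tick. A self-contained sketch of just the tail shown in this hunk; msecs_to_jiffies() is approximated and HZ is assumed to be 250 purely for illustration:

#include <stdio.h>

#define HZ 250UL /* assumed tick rate for the example */

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Roughly the jiffies conversion, busy offset and clamp shown above. */
static unsigned long balance_interval(unsigned long interval_ms, int cpu_busy,
				      unsigned long max_interval)
{
	unsigned long interval = interval_ms * HZ / 1000; /* ~msecs_to_jiffies() */

	/* Busy balancing: shift by one jiffy so nested domains don't align. */
	if (cpu_busy)
		interval -= 1;

	return clampul(interval, 1UL, max_interval);
}

int main(void)
{
	/* e.g. an 8 ms domain interval, max_load_balance_interval = HZ*8/10 */
	printf("idle: %lu jiffies\n", balance_interval(8, 0, HZ * 8 / 10)); /* 2 */
	printf("busy: %lu jiffies\n", balance_interval(8, 1, HZ * 8 / 10)); /* 1 */
	return 0;
}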
....@@ -10110,9 +10178,8 @@
1011010178 /* Search for an sd spanning us and the target CPU. */
1011110179 rcu_read_lock();
1011210180 for_each_domain(target_cpu, sd) {
10113
- if ((sd->flags & SD_LOAD_BALANCE) &&
10114
- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10115
- break;
10181
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10182
+ break;
1011610183 }
1011710184
1011810185 if (likely(sd)) {
....@@ -10130,6 +10197,7 @@
1013010197 * about DST_PINNED.
1013110198 */
1013210199 .flags = LBF_DST_PINNED,
10200
+ .src_rq_rf = &rf,
1013310201 };
1013410202
1013510203 schedstat_inc(sd->alb_count);
....@@ -10165,7 +10233,7 @@
1016510233 */
1016610234 void update_max_interval(void)
1016710235 {
10168
- max_load_balance_interval = HZ*num_online_cpus()/10;
10236
+ max_load_balance_interval = HZ*num_active_cpus()/10;
1016910237 }
1017010238
1017110239 /*
....@@ -10178,6 +10246,7 @@
1017810246 {
1017910247 int continue_balancing = 1;
1018010248 int cpu = rq->cpu;
10249
+ int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1018110250 unsigned long interval;
1018210251 struct sched_domain *sd;
1018310252 /* Earliest time when we have to do rebalance again */
....@@ -10185,6 +10254,10 @@
1018510254 int update_next_balance = 0;
1018610255 int need_serialize, need_decay = 0;
1018710256 u64 max_cost = 0;
10257
+
10258
+ trace_android_rvh_sched_rebalance_domains(rq, &continue_balancing);
10259
+ if (!continue_balancing)
10260
+ return;
1018810261
1018910262 rcu_read_lock();
1019010263 for_each_domain(cpu, sd) {
....@@ -10200,9 +10273,6 @@
1020010273 }
1020110274 max_cost += sd->max_newidle_lb_cost;
1020210275
10203
- if (!(sd->flags & SD_LOAD_BALANCE))
10204
- continue;
10205
-
1020610276 /*
1020710277 * Stop the load balance at this level. There is another
1020810278 * CPU in our sched group which is doing load balancing more
....@@ -10214,7 +10284,7 @@
1021410284 break;
1021510285 }
1021610286
10217
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10287
+ interval = get_sd_balance_interval(sd, busy);
1021810288
1021910289 need_serialize = sd->flags & SD_SERIALIZE;
1022010290 if (need_serialize) {
....@@ -10230,9 +10300,10 @@
1023010300 * state even if we migrated tasks. Update it.
1023110301 */
1023210302 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
10303
+ busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
1023310304 }
1023410305 sd->last_balance = jiffies;
10235
- interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
10306
+ interval = get_sd_balance_interval(sd, busy);
1023610307 }
1023710308 if (need_serialize)
1023810309 spin_unlock(&balancing);
....@@ -10292,7 +10363,11 @@
1029210363
1029310364 static inline int find_new_ilb(void)
1029410365 {
10295
- int ilb;
10366
+ int ilb = -1;
10367
+
10368
+ trace_android_rvh_find_new_ilb(nohz.idle_cpus_mask, &ilb);
10369
+ if (ilb >= 0)
10370
+ return ilb;
1029610371
1029710372 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
1029810373 housekeeping_cpumask(HK_FLAG_MISC)) {
....@@ -10323,29 +10398,25 @@
1032310398 if (ilb_cpu >= nr_cpu_ids)
1032410399 return;
1032510400
10401
+ /*
10402
+ * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10403
+ * the first flag owns it; cleared by nohz_csd_func().
10404
+ */
1032610405 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
1032710406 if (flags & NOHZ_KICK_MASK)
1032810407 return;
1032910408
1033010409 /*
10331
- * Use smp_send_reschedule() instead of resched_cpu().
10332
- * This way we generate a sched IPI on the target CPU which
10410
+ * This way we generate an IPI on the target CPU which
1033310411 * is idle. And the softirq performing nohz idle load balance
1033410412 * will be run before returning from the IPI.
1033510413 */
10336
- smp_send_reschedule(ilb_cpu);
10414
+ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
1033710415 }
1033810416
1033910417 /*
10340
- * Current heuristic for kicking the idle load balancer in the presence
10341
- * of an idle cpu in the system.
10342
- * - This rq has more than one task.
10343
- * - This rq has at least one CFS task and the capacity of the CPU is
10344
- * significantly reduced because of RT tasks or IRQs.
10345
- * - At parent of LLC scheduler domain level, this cpu's scheduler group has
10346
- * multiple busy cpu.
10347
- * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
10348
- * domain span are idle.
10418
+ * Current decision point for kicking the idle load balancer in the presence
10419
+ * of idle CPUs in the system.
1034910420 */
1035010421 static void nohz_balancer_kick(struct rq *rq)
1035110422 {
....@@ -10354,6 +10425,7 @@
1035410425 struct sched_domain *sd;
1035510426 int nr_busy, i, cpu = rq->cpu;
1035610427 unsigned int flags = 0;
10428
+ int done = 0;
1035710429
1035810430 if (unlikely(rq->idle_balance))
1035910431 return;
....@@ -10378,30 +10450,25 @@
1037810450 if (time_before(now, nohz.next_balance))
1037910451 goto out;
1038010452
10381
- if (rq->nr_running >= 2 || rq->misfit_task_load) {
10453
+ trace_android_rvh_sched_nohz_balancer_kick(rq, &flags, &done);
10454
+ if (done)
10455
+ goto out;
10456
+
10457
+ if (rq->nr_running >= 2) {
1038210458 flags = NOHZ_KICK_MASK;
1038310459 goto out;
1038410460 }
1038510461
1038610462 rcu_read_lock();
10387
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10388
- if (sds) {
10389
- /*
10390
- * XXX: write a coherent comment on why we do this.
10391
- * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
10392
- */
10393
- nr_busy = atomic_read(&sds->nr_busy_cpus);
10394
- if (nr_busy > 1) {
10395
- flags = NOHZ_KICK_MASK;
10396
- goto unlock;
10397
- }
10398
-
10399
- }
1040010463
1040110464 sd = rcu_dereference(rq->sd);
1040210465 if (sd) {
10403
- if ((rq->cfs.h_nr_running >= 1) &&
10404
- check_cpu_capacity(rq, sd)) {
10466
+ /*
10467
+ * If there's a CFS task and the current CPU has reduced
10468
+ * capacity; kick the ILB to see if there's a better CPU to run
10469
+ * on.
10470
+ */
10471
+ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
1040510472 flags = NOHZ_KICK_MASK;
1040610473 goto unlock;
1040710474 }
....@@ -10409,15 +10476,55 @@
1040910476
1041010477 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
1041110478 if (sd) {
10412
- for_each_cpu(i, sched_domain_span(sd)) {
10413
- if (i == cpu ||
10414
- !cpumask_test_cpu(i, nohz.idle_cpus_mask))
10415
- continue;
10416
-
10479
+ /*
10480
+ * When ASYM_PACKING; see if there's a more preferred CPU
10481
+ * currently idle; in which case, kick the ILB to move tasks
10482
+ * around.
10483
+ */
10484
+ for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
1041710485 if (sched_asym_prefer(i, cpu)) {
1041810486 flags = NOHZ_KICK_MASK;
1041910487 goto unlock;
1042010488 }
10489
+ }
10490
+ }
10491
+
10492
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10493
+ if (sd) {
10494
+ /*
10495
+ * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10496
+ * to run the misfit task on.
10497
+ */
10498
+ if (check_misfit_status(rq, sd)) {
10499
+ flags = NOHZ_KICK_MASK;
10500
+ goto unlock;
10501
+ }
10502
+
10503
+ /*
10504
+ * For asymmetric systems, we do not want to nicely balance
10505
+ * cache use, instead we want to embrace asymmetry and only
10506
+ * ensure tasks have enough CPU capacity.
10507
+ *
10508
+ * Skip the LLC logic because it's not relevant in that case.
10509
+ */
10510
+ goto unlock;
10511
+ }
10512
+
10513
+ sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10514
+ if (sds) {
10515
+ /*
10516
+ * If there is an imbalance between LLC domains (IOW we could
10517
+ * increase the overall cache use), we need some less-loaded LLC
10518
+ * domain to pull some load. Likewise, we may need to spread
10519
+ * load within the current LLC domain (e.g. packed SMT cores but
10520
+ * other CPUs are idle). We can't really know from here how busy
10521
+ * the others are - so just get a nohz balance going if it looks
10522
+ * like this LLC domain has tasks we could move.
10523
+ */
10524
+ nr_busy = atomic_read(&sds->nr_busy_cpus);
10525
+ if (nr_busy > 1) {
10526
+ flags = NOHZ_KICK_MASK;
10527
+ goto unlock;
1042110528 }
1042210529 }
1042310530 unlock:
....@@ -10483,9 +10590,20 @@
1048310590
1048410591 SCHED_WARN_ON(cpu != smp_processor_id());
1048510592
10486
- /* If this CPU is going down, then nothing needs to be done: */
10487
- if (!cpu_active(cpu))
10593
+ if (!cpu_active(cpu)) {
10594
+ /*
10595
+ * A CPU can be paused while it is idle with its tick
10596
+ * stopped. nohz_balance_exit_idle() should be called
10597
+ * from the local CPU, so it can't be called during
10598
+ * pause. This results in the paused CPU participating in
10599
+ * the nohz idle balance, which should be avoided.
10600
+ *
10601
+ * When the paused CPU exits idle and enters again,
10602
+ * exempt the paused CPU from nohz_balance_exit_idle.
10603
+ */
10604
+ nohz_balance_exit_idle(rq);
1048810605 return;
10606
+ }
1048910607
1049010608 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
1049110609 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
....@@ -10598,7 +10716,6 @@
1059810716
1059910717 rq_lock_irqsave(rq, &rf);
1060010718 update_rq_clock(rq);
10601
- cpu_load_update_idle(rq);
1060210719 rq_unlock_irqrestore(rq, &rf);
1060310720
1060410721 if (flags & NOHZ_BALANCE_KICK)
....@@ -10648,22 +10765,14 @@
1064810765 */
1064910766 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
1065010767 {
10651
- int this_cpu = this_rq->cpu;
10652
- unsigned int flags;
10768
+ unsigned int flags = this_rq->nohz_idle_balance;
1065310769
10654
- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
10770
+ if (!flags)
1065510771 return false;
1065610772
10657
- if (idle != CPU_IDLE) {
10658
- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10659
- return false;
10660
- }
10773
+ this_rq->nohz_idle_balance = 0;
1066110774
10662
- /*
10663
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
10664
- */
10665
- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
10666
- if (!(flags & NOHZ_KICK_MASK))
10775
+ if (idle != CPU_IDLE)
1066710776 return false;
1066810777
1066910778 _nohz_idle_balance(this_rq, flags, idle);
....@@ -10717,15 +10826,26 @@
1071710826 /*
1071810827 * idle_balance is called by schedule() if this_cpu is about to become
1071910828 * idle. Attempts to pull tasks from other CPUs.
10829
+ *
10830
+ * Returns:
10831
+ * < 0 - we released the lock and there are !fair tasks present
10832
+ * 0 - failed, no new tasks
10833
+ * > 0 - success, new (fair) tasks present
1072010834 */
10721
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
10835
+static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
1072210836 {
1072310837 unsigned long next_balance = jiffies + HZ;
1072410838 int this_cpu = this_rq->cpu;
1072510839 struct sched_domain *sd;
1072610840 int pulled_task = 0;
1072710841 u64 curr_cost = 0;
10842
+ int done = 0;
1072810843
10844
+ trace_android_rvh_sched_newidle_balance(this_rq, rf, &pulled_task, &done);
10845
+ if (done)
10846
+ return pulled_task;
10847
+
10848
+ update_misfit_status(NULL, this_rq);
1072910849 /*
1073010850 * We must set idle_stamp _before_ calling idle_balance(), such that we
1073110851 * measure the duration of idle_balance() as idle time.
....@@ -10767,9 +10887,6 @@
1076710887 for_each_domain(this_cpu, sd) {
1076810888 int continue_balancing = 1;
1076910889 u64 t0, domain_cost;
10770
-
10771
- if (!(sd->flags & SD_LOAD_BALANCE))
10772
- continue;
1077310890
1077410891 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
1077510892 update_next_balance(sd, &next_balance);
....@@ -10943,7 +11060,7 @@
1094311060 * 'current' within the tree based on its new key value.
1094411061 */
1094511062 swap(curr->vruntime, se->vruntime);
10946
- resched_curr(rq);
11063
+ resched_curr_lazy(rq);
1094711064 }
1094811065
1094911066 se->vruntime -= cfs_rq->min_vruntime;
....@@ -10960,6 +11077,9 @@
1096011077 if (!task_on_rq_queued(p))
1096111078 return;
1096211079
11080
+ if (rq->cfs.nr_running == 1)
11081
+ return;
11082
+
1096311083 /*
1096411084 * Reschedule if we are currently running on this runqueue and
1096511085 * our priority decreased, or if we are not currently running on
....@@ -10967,7 +11087,7 @@
1096711087 */
1096811088 if (rq->curr == p) {
1096911089 if (p->prio > oldprio)
10970
- resched_curr(rq);
11090
+ resched_curr_lazy(rq);
1097111091 } else
1097211092 check_preempt_curr(rq, p, 0);
1097311093 }
....@@ -11038,7 +11158,7 @@
1103811158 /* Catch up with the cfs_rq and remove our load when we leave */
1103911159 update_load_avg(cfs_rq, se, 0);
1104011160 detach_entity_load_avg(cfs_rq, se);
11041
- update_tg_load_avg(cfs_rq, false);
11161
+ update_tg_load_avg(cfs_rq);
1104211162 propagate_entity_cfs_rq(se);
1104311163 }
1104411164
....@@ -11056,8 +11176,8 @@
1105611176
1105711177 /* Synchronize entity with its cfs_rq */
1105811178 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
11059
- attach_entity_load_avg(cfs_rq, se, 0);
11060
- update_tg_load_avg(cfs_rq, false);
11179
+ attach_entity_load_avg(cfs_rq, se);
11180
+ update_tg_load_avg(cfs_rq);
1106111181 propagate_entity_cfs_rq(se);
1106211182 }
1106311183
....@@ -11116,9 +11236,19 @@
1111611236 * This routine is mostly called to set cfs_rq->curr field when a task
1111711237 * migrates between groups/classes.
1111811238 */
11119
-static void set_curr_task_fair(struct rq *rq)
11239
+static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
1112011240 {
11121
- struct sched_entity *se = &rq->curr->se;
11241
+ struct sched_entity *se = &p->se;
11242
+
11243
+#ifdef CONFIG_SMP
11244
+ if (task_on_rq_queued(p)) {
11245
+ /*
11246
+ * Move the next running task to the front of the list, so our
11247
+ * cfs_tasks list becomes MRU one.
11248
+ */
11249
+ list_move(&se->group_node, &rq->cfs_tasks);
11250
+ }
11251
+#endif
1112211252
1112311253 for_each_sched_entity(se) {
1112411254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
....@@ -11379,8 +11509,8 @@
1137911509 /*
1138011510 * All the scheduling class methods:
1138111511 */
11382
-const struct sched_class fair_sched_class = {
11383
- .next = &idle_sched_class,
11512
+const struct sched_class fair_sched_class
11513
+ __section("__fair_sched_class") = {
1138411514 .enqueue_task = enqueue_task_fair,
1138511515 .dequeue_task = dequeue_task_fair,
1138611516 .yield_task = yield_task_fair,
....@@ -11388,10 +11518,12 @@
1138811518
1138911519 .check_preempt_curr = check_preempt_wakeup,
1139011520
11391
- .pick_next_task = pick_next_task_fair,
11521
+ .pick_next_task = __pick_next_task_fair,
1139211522 .put_prev_task = put_prev_task_fair,
11523
+ .set_next_task = set_next_task_fair,
1139311524
1139411525 #ifdef CONFIG_SMP
11526
+ .balance = balance_fair,
1139511527 .select_task_rq = select_task_rq_fair,
1139611528 .migrate_task_rq = migrate_task_rq_fair,
1139711529
....@@ -11402,7 +11534,6 @@
1140211534 .set_cpus_allowed = set_cpus_allowed_common,
1140311535 #endif
1140411536
11405
- .set_curr_task = set_curr_task_fair,
1140611537 .task_tick = task_tick_fair,
1140711538 .task_fork = task_fork_fair,
1140811539
....@@ -11472,3 +11603,101 @@
1147211603 #endif /* SMP */
1147311604
1147411605 }
11606
+
11607
+/*
11608
+ * Helper functions to facilitate extracting info from tracepoints.
11609
+ */
11610
+
11611
+const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11612
+{
11613
+#ifdef CONFIG_SMP
11614
+ return cfs_rq ? &cfs_rq->avg : NULL;
11615
+#else
11616
+ return NULL;
11617
+#endif
11618
+}
11619
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11620
+
11621
+char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11622
+{
11623
+ if (!cfs_rq) {
11624
+ if (str)
11625
+ strlcpy(str, "(null)", len);
11626
+ else
11627
+ return NULL;
11628
+ }
11629
+
11630
+ cfs_rq_tg_path(cfs_rq, str, len);
11631
+ return str;
11632
+}
11633
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11634
+
11635
+int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11636
+{
11637
+ return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11638
+}
11639
+EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11640
+
11641
+const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11642
+{
11643
+#ifdef CONFIG_SMP
11644
+ return rq ? &rq->avg_rt : NULL;
11645
+#else
11646
+ return NULL;
11647
+#endif
11648
+}
11649
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11650
+
11651
+const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11652
+{
11653
+#ifdef CONFIG_SMP
11654
+ return rq ? &rq->avg_dl : NULL;
11655
+#else
11656
+ return NULL;
11657
+#endif
11658
+}
11659
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11660
+
11661
+const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11662
+{
11663
+#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11664
+ return rq ? &rq->avg_irq : NULL;
11665
+#else
11666
+ return NULL;
11667
+#endif
11668
+}
11669
+EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11670
+
11671
+int sched_trace_rq_cpu(struct rq *rq)
11672
+{
11673
+ return rq ? cpu_of(rq) : -1;
11674
+}
11675
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11676
+
11677
+int sched_trace_rq_cpu_capacity(struct rq *rq)
11678
+{
11679
+ return rq ?
11680
+#ifdef CONFIG_SMP
11681
+ rq->cpu_capacity
11682
+#else
11683
+ SCHED_CAPACITY_SCALE
11684
+#endif
11685
+ : -1;
11686
+}
11687
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11688
+
11689
+const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11690
+{
11691
+#ifdef CONFIG_SMP
11692
+ return rd ? rd->span : NULL;
11693
+#else
11694
+ return NULL;
11695
+#endif
11696
+}
11697
+EXPORT_SYMBOL_GPL(sched_trace_rd_span);
11698
+
11699
+int sched_trace_rq_nr_running(struct rq *rq)
11700
+{
11701
+ return rq ? rq->nr_running : -1;
11702
+}
11703
+EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
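These helpers exist so that modules attached to the bare scheduler tracepoints can read PELT state without knowing the layout of struct rq or struct cfs_rq. A hypothetical module sketch of that usage; it assumes the pelt_cfs tracepoint and the helper declarations are visible through <trace/events/sched.h> and <linux/sched.h>, which can vary between kernel versions:

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

/* Called every time the CFS PELT signal of a cfs_rq is updated. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;

	sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path));
	pr_debug("cpu%d %s util_avg=%lu\n",
		 sched_trace_cfs_rq_cpu(cfs_rq), path, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");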