2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/kernel/sched/topology.c
@@ -4,11 +4,16 @@
  */
 #include "sched.h"
 
+#include <trace/hooks/sched.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -25,22 +30,22 @@
 	return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
+	unsigned long flags = sd->flags;
+	unsigned int idx;
 
 	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-		return -1;
-	}
-
 	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
@@ -49,6 +54,21 @@
 	}
 	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+	}
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		unsigned int flag = BIT(idx);
+		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+		    !(sd->child->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+			       sd_flag_debug[idx].name);
+
+		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+		    !(sd->parent->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+			       sd_flag_debug[idx].name);
 	}
 
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -145,23 +165,22 @@
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
-	if (sd->flags & (SD_LOAD_BALANCE |
-			 SD_BALANCE_NEWIDLE |
-			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC |
-			 SD_SHARE_CPUCAPACITY |
-			 SD_ASYM_CPUCAPACITY |
-			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
-		if (sd->groups != sd->groups->next)
-			return 0;
-	}
+	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+	    (sd->groups != sd->groups->next))
+		return 0;
 
 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
@@ -182,36 +201,24 @@
 		return 0;
 
 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
-		pflags &= ~(SD_LOAD_BALANCE |
-			    SD_BALANCE_NEWIDLE |
-			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC |
-			    SD_ASYM_CPUCAPACITY |
-			    SD_SHARE_CPUCAPACITY |
-			    SD_SHARE_PKG_RESOURCES |
-			    SD_PREFER_SIBLING |
-			    SD_SHARE_POWERDOMAIN);
-		if (nr_node_ids == 1)
-			pflags &= ~SD_SERIALIZE;
-	}
+	if (parent->groups == parent->groups->next)
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
 	if (~cflags & pflags)
 		return 0;
 
 	return 1;
 }
 
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-
-#ifdef CONFIG_ENERGY_MODEL
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
 
 #ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp, loff_t *ppos)
+		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
 
@@ -233,7 +240,6 @@
 	return ret;
 }
 #endif
-#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 static void free_pd(struct perf_domain *pd)
 {
@@ -285,10 +291,10 @@
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
 
 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}
 
@@ -320,44 +326,55 @@
  * EAS can be used on a root domain if it meets all the following conditions:
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    3. no SMT is detected.
+ *    4. the EM complexity is low enough to keep scheduling overheads low;
  *
  * The complexity of the Energy Model is defined as:
  *
- *              C = nr_pd * (nr_cpus + nr_cs)
+ *              C = nr_pd * (nr_cpus + nr_ps)
  *
  * with parameters defined as:
  *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
- *  - nr_cs:    the sum of the number of capacity states of all performance
+ *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
- *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *              with 10 performance states each, nr_ps = 2 * 10 = 20).
  *
  * It is generally not a good idea to use such a model in the wake-up path on
  * very complex platforms because of the associated scheduling overheads. The
  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
  */
 #define EM_MAX_COMPLEXITY 2048
 
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	bool eas_check = false;
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	if (!sysctl_sched_energy_aware)
 		goto free;
-#endif
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+	/*
+	 * EAS is enabled for asymmetric CPU capacity topologies.
+	 * Allow vendor to override if desired.
+	 */
+	trace_android_rvh_build_perf_domains(&eas_check);
+	if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
 					cpumask_pr_args(cpu_map));
 		}
+		goto free;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+			cpumask_pr_args(cpu_map));
 		goto free;
 	}
 
@@ -374,15 +391,15 @@
 		pd = tmp;
 
 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}
 
 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 				cpumask_pr_args(cpu_map));
 		goto free;
@@ -409,7 +426,7 @@
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -459,7 +476,7 @@
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+		call_rcu(&old_rd->rcu, free_rootdomain);
 }
 
 void sched_get_rd(struct root_domain *rd)
@@ -472,7 +489,7 @@
 	if (!atomic_dec_and_test(&rd->refcount))
 		return;
 
-	call_rcu_sched(&rd->rcu, free_rootdomain);
+	call_rcu(&rd->rcu, free_rootdomain);
 }
 
 static int init_rootdomain(struct root_domain *rd)
@@ -490,7 +507,6 @@
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-	rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
 #endif
 
 	init_dl_bw(&rd->dl_bw);
@@ -499,9 +515,6 @@
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
-
-	init_max_cpu_capacity(&rd->max_cpu_capacity);
-
 	return 0;
 
 free_cpudl:
@@ -607,13 +620,13 @@
  * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
  */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -1051,6 +1064,7 @@
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg;
+	bool already_visited;
 
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
@@ -1058,9 +1072,14 @@
 	sg = *per_cpu_ptr(sdd->sg, cpu);
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 
-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;
 
 	if (child) {
 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1079,8 +1098,8 @@
 
 /*
  * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
  *
  * Assumes the sched_domain tree is fully constructed
  */
@@ -1187,16 +1206,13 @@
 	if (!attr || attr->relax_domain_level < 0) {
 		if (default_relax_domain_level < 0)
 			return;
-		else
-			request = default_relax_domain_level;
+		request = default_relax_domain_level;
 	} else
 		request = attr->relax_domain_level;
-	if (request < sd->level) {
+
+	if (sd->level > request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* Turn on idle balance on this domain: */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
@@ -1210,13 +1226,13 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu);
-		/* Fall through */
+		fallthrough;
 	case sa_sd:
 		free_percpu(d->sd);
-		/* Fall through */
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
-		/* Fall through */
+		fallthrough;
 	case sa_none:
 		break;
 	}
@@ -1270,6 +1286,7 @@
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1282,7 +1299,6 @@
  * SD_SHARE_CPUCAPACITY   - describes SMT topologies
 * SD_SHARE_PKG_RESOURCES - describes shared caches
 * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
  *
  * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1293,8 +1309,7 @@
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
-	 SD_ASYM_PACKING	|	\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1326,18 +1341,12 @@
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
-		.busy_factor		= 32,
-		.imbalance_pct		= 125,
+		.busy_factor		= 16,
+		.imbalance_pct		= 117,
 
 		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
 
-		.flags			= 1*SD_LOAD_BALANCE
-					| 1*SD_BALANCE_NEWIDLE
+		.flags			= 1*SD_BALANCE_NEWIDLE
 					| 1*SD_BALANCE_EXEC
 					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
@@ -1352,7 +1361,6 @@
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 		.child			= child,
@@ -1368,37 +1376,24 @@
 	 * Convert topological properties into behaviour.
 	 */
 
-	if (sd->flags & SD_ASYM_CPUCAPACITY) {
-		struct sched_domain *t = sd;
-
-		/*
-		 * Don't attempt to spread across CPUs of different capacities.
-		 */
-		if (sd->child)
-			sd->child->flags &= ~SD_PREFER_SIBLING;
-
-		for_each_lower_domain(t)
-			t->flags |= SD_BALANCE_WAKE;
-	}
+	/* Don't attempt to spread across CPUs of different capacities. */
+	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+		sd->child->flags &= ~SD_PREFER_SIBLING;
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
 
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
 
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
@@ -1407,8 +1402,6 @@
 		}
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}
 
@@ -1549,66 +1542,58 @@
 	}
 }
 
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
+			int distance = node_distance(i, j);
 
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}
 
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+			bitmap_set(distance_map, distance, 1);
+		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}
 
+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1617,15 +1602,15 @@
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 
@@ -1633,7 +1618,7 @@
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1641,12 +1626,17 @@
 
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;
 
 			sched_domains_numa_masks[i][j] = mask;
 
 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 
@@ -1658,7 +1648,7 @@
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1681,7 +1671,7 @@
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1693,8 +1683,8 @@
 
 	sched_domain_topology = tl;
 
-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
 	init_numa_topology_type();
 }
@@ -1720,6 +1710,26 @@
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ *                             closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+	int i, j = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	return nr_cpu_ids;
 }
 
 #endif /* CONFIG_NUMA */
@@ -1860,6 +1870,42 @@
 }
 
 /*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+			      const struct cpumask *cpu_map, int cpu)
+{
+	int i;
+
+	/* NUMA levels are allowed to overlap */
+	if (tl->flags & SDTL_OVERLAP)
+		return true;
+
+	/*
+	 * Non-NUMA levels cannot partially overlap - they must be either
+	 * completely equal or completely disjoint. Otherwise we can end up
+	 * breaking the sched_group lists - i.e. a later get_group() pass
+	 * breaks the linking done for an earlier span.
+	 */
+	for_each_cpu(i, cpu_map) {
+		if (i == cpu)
+			continue;
+		/*
+		 * We should 'and' all those masks with 'cpu_map' to exactly
+		 * match the topology we're about to build, but that can only
+		 * remove CPUs, which only lessens our ability to detect
+		 * overlaps
+		 */
+		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
  * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs.
  */
@@ -1872,10 +1918,10 @@
 	unsigned long cap;
 
 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
 
 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1890,7 +1936,7 @@
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;
 
 		for_each_sd_topology(tl) {
@@ -1900,7 +1946,7 @@
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;
 
-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);
 
 				if (capacity <= max_capacity)
 					continue;
@@ -1925,12 +1971,16 @@
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -1941,15 +1991,17 @@
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;
 			}
+
+			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+				goto error;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
 
@@ -1990,13 +2042,25 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+	if (rq && sched_debug_enabled) {
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+	trace_android_vh_build_sched_domains(has_asym);
 
 	ret = 0;
 error:
@@ -2057,9 +2121,8 @@
 }
 
 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
  */
 int sched_init_domains(const struct cpumask *cpu_map)
 {
@@ -2140,16 +2203,16 @@
  * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
  *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
  */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
-	mutex_lock(&sched_domains_mutex);
+	lockdep_assert_held(&sched_domains_mutex);
 
 	/* Always unregister in case we don't destroy any domains: */
 	unregister_sched_domain_sysctl();
@@ -2174,8 +2237,19 @@
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j))
+			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
+				struct root_domain *rd;
+
+				/*
+				 * This domain won't be destroyed and as such
+				 * its dl_bw->total_bw needs to be cleared. It
+				 * will be recomputed in function
+				 * update_tasks_root_domain().
+				 */
+				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+				dl_clear_root_domain(rd);
 				goto match1;
+			}
 		}
 		/* No match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
@@ -2204,10 +2278,10 @@
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
 				has_eas = true;
@@ -2232,6 +2306,15 @@
 	ndoms_cur = ndoms_new;
 
 	register_sched_domain_sysctl();
+}
 
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	mutex_lock(&sched_domains_mutex);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	mutex_unlock(&sched_domains_mutex);
 }
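
Note on the SD_FLAG() blocks added above: they re-include <linux/sched/sd_flags.h> with different SD_FLAG() definitions, so one flag list produces both the sd_flag_debug[] name table and the SD_DEGENERATE_GROUPS_MASK constant. Below is a minimal userspace sketch of the same include-twice (x-macro) idea; the EXAMPLE_SD_FLAGS list, the two flag names and the metaflag values are made up for illustration and are not the kernel's definitions.

#include <stdio.h>

#define SDF_SHARED_CHILD  0x1
#define SDF_NEEDS_GROUPS  0x2

/* One flag list, expanded several different ways below. */
#define EXAMPLE_SD_FLAGS(X)						\
	X(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)	\
	X(SD_WAKE_AFFINE,     SDF_SHARED_CHILD)

/* Expansion 1: bit indices, then the flag values themselves. */
#define X(name, mflags) __##name,
enum { EXAMPLE_SD_FLAGS(X) __SD_FLAG_CNT };
#undef X

#define X(name, mflags) name = 1 << __##name,
enum { EXAMPLE_SD_FLAGS(X) };
#undef X

/* Expansion 2: a debug table of names and metaflags, indexed by bit. */
struct sd_flag_debug { unsigned int meta_flags; const char *name; };

#define X(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
static const struct sd_flag_debug sd_flag_debug[] = { EXAMPLE_SD_FLAGS(X) };
#undef X

/* Expansion 3: a mask of every flag that needs more than one group. */
#define X(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK = EXAMPLE_SD_FLAGS(X) 0;
#undef X

int main(void)
{
	for (unsigned int i = 0; i < __SD_FLAG_CNT; i++)
		printf("%-20s meta=%#x\n", sd_flag_debug[i].name,
		       sd_flag_debug[i].meta_flags);
	printf("degenerate-groups mask = %#x\n", SD_DEGENERATE_GROUPS_MASK);
	return 0;
}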
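
The rewritten sched_init_numa() above replaces the old selection-sort deduplication with a bitmap of observed distance values, then walks the set bits in ascending order to build the sorted array of unique distances. A small userspace sketch of that step follows, using a made-up 4-node distance table and a plain bool array in place of the kernel bitmap API.

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES            4
#define LOCAL_DISTANCE      10
#define NR_DISTANCE_VALUES  256          /* kernel: 1 << DISTANCE_BITS */

/* Illustrative SLIT-style table; row i, column j = distance(i, j). */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

int main(void)
{
	bool distance_map[NR_DISTANCE_VALUES] = { false };
	int distances[NR_DISTANCE_VALUES];
	int nr_levels = 0;

	/* O(nr_nodes^2) pass: record every distance value that occurs. */
	for (int i = 0; i < NR_NODES; i++) {
		for (int j = 0; j < NR_NODES; j++) {
			int d = dist[i][j];

			if (d < LOCAL_DISTANCE || d >= NR_DISTANCE_VALUES) {
				fprintf(stderr, "invalid distance %d\n", d);
				return 1;
			}
			distance_map[d] = true;
		}
	}

	/* Walk set bits in ascending order, as find_next_bit() would. */
	for (int d = 0; d < NR_DISTANCE_VALUES; d++)
		if (distance_map[d])
			distances[nr_levels++] = d;

	printf("nr_levels = %d:", nr_levels);	/* expect 3: 10 20 30 */
	for (int i = 0; i < nr_levels; i++)
		printf(" %d", distances[i]);
	printf("\n");
	return 0;
}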
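
The new trace_android_rvh_build_perf_domains() call gives a vendor module a chance to force EAS on even when sd_asym_cpucapacity is not set. A sketch of how such a module might hook it is below; it assumes the hook is declared in include/trace/hooks/sched.h with the usual DECLARE_RESTRICTED_HOOK(android_rvh_build_perf_domains, TP_PROTO(bool *eas_check), ...) form, so check the actual prototype there. Restricted vendor hooks can only be registered once and cannot be unregistered, so this belongs in a vendor module that is never unloaded.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical vendor module; hook prototype assumed from the call site above. */
#include <linux/module.h>
#include <trace/hooks/sched.h>

static void vendor_build_perf_domains(void *unused, bool *eas_check)
{
	/* Let build_perf_domains() proceed even on a symmetric topology. */
	*eas_check = true;
}

static int __init vendor_eas_init(void)
{
	return register_trace_android_rvh_build_perf_domains(
					vendor_build_perf_domains, NULL);
}
module_init(vendor_eas_init);

MODULE_LICENSE("GPL");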