forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/kernel/sched/topology.c
@@ -4,11 +4,16 @@
  */
 #include "sched.h"
 
+#include <trace/hooks/sched.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -25,22 +30,22 @@
 	return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
+	unsigned long flags = sd->flags;
+	unsigned int idx;
 
 	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-		return -1;
-	}
-
 	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
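
Note: the hunk above builds the sd_flag_debug[] table with an x-macro: <linux/sched/sd_flags.h> holds one SD_FLAG(name, metaflags) entry per scheduler-domain flag, and each inclusion of that header expands the entries with whatever SD_FLAG definition is in scope at the time. A minimal standalone sketch of the same pattern, with invented flag names and metaflag values rather than the real sd_flags.h contents:

    /* sketch_flags.h -- hypothetical stand-in for <linux/sched/sd_flags.h>,
     * deliberately without include guards so it can be expanded repeatedly. */
    SD_FLAG(SD_EXAMPLE_A, 0x1)
    SD_FLAG(SD_EXAMPLE_B, 0x2)

    /* sketch.c -- expand the same list twice: once into an enum of indices,
     * once into a name/metaflag debug table keyed by those indices. */
    struct flag_debug { unsigned int meta_flags; const char *name; };

    enum {
    #define SD_FLAG(_name, mflags) __##_name,
    #include "sketch_flags.h"
    #undef SD_FLAG
    	__FLAG_CNT,
    };

    static const struct flag_debug flag_debug[] = {
    #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
    #include "sketch_flags.h"
    #undef SD_FLAG
    };
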
@@ -49,6 +54,21 @@
 	}
 	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+	}
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		unsigned int flag = BIT(idx);
+		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+		    !(sd->child->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+			       sd_flag_debug[idx].name);
+
+		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+		    !(sd->parent->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+			       sd_flag_debug[idx].name);
 	}
 
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -145,23 +165,22 @@
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
-	if (sd->flags & (SD_LOAD_BALANCE |
-			 SD_BALANCE_NEWIDLE |
-			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC |
-			 SD_SHARE_CPUCAPACITY |
-			 SD_ASYM_CPUCAPACITY |
-			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
-		if (sd->groups != sd->groups->next)
-			return 0;
-	}
+	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+	    (sd->groups != sd->groups->next))
+		return 0;
 
 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
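
Note: SD_DEGENERATE_GROUPS_MASK is built with the same x-macro trick, except each SD_FLAG(name, mflags) entry now expands to "(name * !!((mflags) & SDF_NEEDS_GROUPS)) |", so a flag contributes its bit only when it carries the SDF_NEEDS_GROUPS metaflag, and the trailing "0;" terminates the OR chain. A compilable illustration of the expanded form, with flag values invented for the example (not the real sd_flags.h):

    #define SDF_NEEDS_GROUPS 0x1
    #define SD_EXAMPLE_A     0x01	/* declared with SDF_NEEDS_GROUPS */
    #define SD_EXAMPLE_B     0x02	/* declared without it */

    static const unsigned int EXAMPLE_MASK =
    	(SD_EXAMPLE_A * !!((SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) |	/* keeps 0x01 */
    	(SD_EXAMPLE_B * !!((0) & SDF_NEEDS_GROUPS)) |			/* contributes 0 */
    0;	/* EXAMPLE_MASK == 0x01 */
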
@@ -182,36 +201,24 @@
 		return 0;
 
 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
-		pflags &= ~(SD_LOAD_BALANCE |
-				SD_BALANCE_NEWIDLE |
-				SD_BALANCE_FORK |
-				SD_BALANCE_EXEC |
-				SD_ASYM_CPUCAPACITY |
-				SD_SHARE_CPUCAPACITY |
-				SD_SHARE_PKG_RESOURCES |
-				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
-		if (nr_node_ids == 1)
-			pflags &= ~SD_SERIALIZE;
-	}
+	if (parent->groups == parent->groups->next)
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
 	if (~cflags & pflags)
 		return 0;
 
 	return 1;
 }
 
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-
-#ifdef CONFIG_ENERGY_MODEL
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
 
 #ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
-			       void __user *buffer, size_t *lenp, loff_t *ppos)
+			       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
 
@@ -233,7 +240,6 @@
 	return ret;
 }
 #endif
-#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 static void free_pd(struct perf_domain *pd)
 {
@@ -285,10 +291,10 @@
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
 
 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}
 
@@ -320,44 +326,55 @@
  * EAS can be used on a root domain if it meets all the following conditions:
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    3. no SMT is detected.
+ *    4. the EM complexity is low enough to keep scheduling overheads low;
  *
  * The complexity of the Energy Model is defined as:
 *
- *              C = nr_pd * (nr_cpus + nr_cs)
+ *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with parameters defined as:
 *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
- *  - nr_cs:    the sum of the number of capacity states of all performance
+ *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
- *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *              with 10 performance states each, nr_ps = 2 * 10 = 20).
 *
 * It is generally not a good idea to use such a model in the wake-up path on
 * very complex platforms because of the associated scheduling overheads. The
 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
 */
 #define EM_MAX_COMPLEXITY 2048
 
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	bool eas_check = false;
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	if (!sysctl_sched_energy_aware)
 		goto free;
-#endif
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+	/*
+	 * EAS is enabled for asymmetric CPU capacity topologies.
+	 * Allow vendor to override if desired.
+	 */
+	trace_android_rvh_build_perf_domains(&eas_check);
+	if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
 					cpumask_pr_args(cpu_map));
 		}
+		goto free;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+			cpumask_pr_args(cpu_map));
 		goto free;
 	}
 
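
Note: as a worked example of the complexity check above (numbers invented for illustration): a system with nr_pd = 2 performance domains, nr_cpus = 8 and 10 performance states per domain gives nr_ps = 2 * 10 = 20, so C = nr_pd * (nr_cpus + nr_ps) = 2 * (8 + 20) = 56, comfortably below EM_MAX_COMPLEXITY (2048). A hypothetical 16-CPU platform with per-CPU performance domains of 128 states each would instead give C = 16 * (16 + 2048) = 33024, and build_perf_domains() would bail out with the "EM complexity is too high" warning.
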
@@ -374,15 +391,15 @@
 		pd = tmp;
 
 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}
 
 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 					cpumask_pr_args(cpu_map));
 		goto free;
@@ -409,7 +426,7 @@
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -459,7 +476,7 @@
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+		call_rcu(&old_rd->rcu, free_rootdomain);
 }
 
 void sched_get_rd(struct root_domain *rd)
@@ -472,7 +489,7 @@
 	if (!atomic_dec_and_test(&rd->refcount))
 		return;
 
-	call_rcu_sched(&rd->rcu, free_rootdomain);
+	call_rcu(&rd->rcu, free_rootdomain);
 }
 
 static int init_rootdomain(struct root_domain *rd)
@@ -490,7 +507,7 @@
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-	rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
+	atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags);
 #endif
 
 	init_dl_bw(&rd->dl_bw);
@@ -499,9 +516,6 @@
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
-
-	init_max_cpu_capacity(&rd->max_cpu_capacity);
-
 	return 0;
 
 free_cpudl:
@@ -607,13 +621,13 @@
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -1051,6 +1065,7 @@
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg;
+	bool already_visited;
 
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
@@ -1058,9 +1073,14 @@
 	sg = *per_cpu_ptr(sdd->sg, cpu);
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 
-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;
 
 	if (child) {
 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
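
Note: the get_group() change relies on atomic_inc_return() returning the post-increment value, so the first visitor observes 1 and every later visitor observes something greater than 1, which is what already_visited captures. A standalone sketch of that "first visitor does the one-time init" idiom in plain C11 atomics (names invented, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct group { atomic_int ref; int data; };

    /* Take a reference on @g; returns true if some earlier caller already
     * initialized it, false if this caller just performed the one-time init. */
    static bool get_group_ref(struct group *g)
    {
    	bool already_visited = atomic_fetch_add(&g->ref, 1) > 0;

    	if (!already_visited)
    		g->data = 42;	/* one-time initialization by the first visitor */

    	return already_visited;
    }
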
@@ -1079,8 +1099,8 @@
 
 /*
 * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
@@ -1187,16 +1207,13 @@
 	if (!attr || attr->relax_domain_level < 0) {
 		if (default_relax_domain_level < 0)
 			return;
-		else
-			request = default_relax_domain_level;
+		request = default_relax_domain_level;
 	} else
 		request = attr->relax_domain_level;
-	if (request < sd->level) {
+
+	if (sd->level > request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* Turn on idle balance on this domain: */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
@@ -1210,13 +1227,13 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu);
-		/* Fall through */
+		fallthrough;
 	case sa_sd:
 		free_percpu(d->sd);
-		/* Fall through */
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
-		/* Fall through */
+		fallthrough;
 	case sa_none:
 		break;
 	}
@@ -1270,6 +1287,7 @@
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1282,7 +1300,6 @@
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
- *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1293,8 +1310,7 @@
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
-	 SD_ASYM_PACKING	|	\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1326,18 +1342,12 @@
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
-		.busy_factor		= 32,
-		.imbalance_pct		= 125,
+		.busy_factor		= 16,
+		.imbalance_pct		= 117,
 
 		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
 
-		.flags			= 1*SD_LOAD_BALANCE
-					| 1*SD_BALANCE_NEWIDLE
+		.flags			= 1*SD_BALANCE_NEWIDLE
 					| 1*SD_BALANCE_EXEC
 					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
@@ -1352,7 +1362,6 @@
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 		.child			= child,
@@ -1368,37 +1377,24 @@
 	 * Convert topological properties into behaviour.
 	 */
 
-	if (sd->flags & SD_ASYM_CPUCAPACITY) {
-		struct sched_domain *t = sd;
-
-		/*
-		 * Don't attempt to spread across CPUs of different capacities.
-		 */
-		if (sd->child)
-			sd->child->flags &= ~SD_PREFER_SIBLING;
-
-		for_each_lower_domain(t)
-			t->flags |= SD_BALANCE_WAKE;
-	}
+	/* Don't attempt to spread across CPUs of different capacities. */
+	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+		sd->child->flags &= ~SD_PREFER_SIBLING;
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
 
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
 
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
@@ -1407,8 +1403,6 @@
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}
 
 	/*
@@ -1549,66 +1543,58 @@
 	}
 }
 
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
+			int distance = node_distance(i, j);
 
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}
 
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+			bitmap_set(distance_map, distance, 1);
+		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}
 
+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
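
Note: the new sched_init_numa() replaces the old deduplicating selection sort with a set: every node_distance(i, j) value sets one bit in a bitmap, bitmap_weight() then gives the number of unique distances, and find_next_bit() walks them in ascending order to fill sched_domains_numa_distance[]. A small userspace sketch of the same deduplication idea (plain C with an invented 3-node distance table, not the kernel bitmap API):

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_DISTANCE_VALUES 256		/* distances assumed to fit in 8 bits */

    int main(void)
    {
    	/* Hypothetical stand-in for node_distance(i, j). */
    	int node_distance[3][3] = {
    		{ 10, 20, 30 },
    		{ 20, 10, 30 },
    		{ 30, 30, 10 },
    	};
    	bool seen[NR_DISTANCE_VALUES] = { false };
    	int levels[NR_DISTANCE_VALUES];
    	int nr_levels = 0;

    	/* Mark every distance that occurs anywhere in the table... */
    	for (int i = 0; i < 3; i++)
    		for (int j = 0; j < 3; j++)
    			seen[node_distance[i][j]] = true;

    	/* ...then walk the set in ascending order, one level per unique value. */
    	for (int d = 0; d < NR_DISTANCE_VALUES; d++)
    		if (seen[d])
    			levels[nr_levels++] = d;

    	for (int i = 0; i < nr_levels; i++)
    		printf("level %d: distance %d\n", i, levels[i]);	/* 10, 20, 30 */

    	return 0;
    }
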
@@ -1617,15 +1603,15 @@
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 
@@ -1633,7 +1619,7 @@
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1641,12 +1627,17 @@
 
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;
 
 			sched_domains_numa_masks[i][j] = mask;
 
 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 
@@ -1658,7 +1649,7 @@
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1681,7 +1672,7 @@
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1693,8 +1684,8 @@
 
 	sched_domain_topology = tl;
 
-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
 	init_numa_topology_type();
 }
@@ -1720,6 +1711,26 @@
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+	int i, j = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	return nr_cpu_ids;
 }
 
 #endif /* CONFIG_NUMA */
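
Note: sched_numa_find_closest() walks sched_domains_numa_masks[] from the smallest distance outwards, so the first level whose mask intersects the candidate set yields the closest permitted CPU, and nr_cpu_ids signals "nothing found" just like the cpumask iterators do. A hedged usage sketch (the wrapper, 'allowed' and 'home_cpu' are invented for illustration):

    /* Sketch: pick a CPU from 'allowed' that is NUMA-close to 'home_cpu'. */
    static int pick_close_cpu(const struct cpumask *allowed, int home_cpu)
    {
    	int target = sched_numa_find_closest(allowed, home_cpu);

    	if (target >= nr_cpu_ids)
    		target = cpumask_any(allowed);	/* no close CPU: take any allowed one */

    	return target;
    }
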
@@ -1860,6 +1871,42 @@
 }
 
 /*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+			      const struct cpumask *cpu_map, int cpu)
+{
+	int i;
+
+	/* NUMA levels are allowed to overlap */
+	if (tl->flags & SDTL_OVERLAP)
+		return true;
+
+	/*
+	 * Non-NUMA levels cannot partially overlap - they must be either
+	 * completely equal or completely disjoint. Otherwise we can end up
+	 * breaking the sched_group lists - i.e. a later get_group() pass
+	 * breaks the linking done for an earlier span.
+	 */
+	for_each_cpu(i, cpu_map) {
+		if (i == cpu)
+			continue;
+		/*
+		 * We should 'and' all those masks with 'cpu_map' to exactly
+		 * match the topology we're about to build, but that can only
+		 * remove CPUs, which only lessens our ability to detect
+		 * overlaps
+		 */
+		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
 * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs.
 */
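
Note: as a concrete illustration of the new sanity check (CPU numbers invented): a cluster level reporting tl->mask(0) = {0,1} and tl->mask(1) = {0,1} passes (equal), {0,1} against {2,3} passes (disjoint), but a firmware or devicetree bug reporting tl->mask(0) = {0,1} while tl->mask(1) = {1,2} intersects without being equal, so build_sched_domains() below now warns and bails out instead of silently corrupting the sched_group lists.
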
@@ -1872,10 +1919,10 @@
 	unsigned long cap;
 
 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
 
 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1890,7 +1937,7 @@
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;
 
 		for_each_sd_topology(tl) {
@@ -1900,7 +1947,7 @@
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;
 
-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);
 
 				if (capacity <= max_capacity)
 					continue;
@@ -1925,12 +1972,16 @@
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -1941,15 +1992,17 @@
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;
 			}
+
+			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+				goto error;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
 
@@ -1990,13 +2043,25 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+	if (rq && sched_debug_enabled) {
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+	trace_android_vh_build_sched_domains(has_asym);
 
 	ret = 0;
 error:
@@ -2057,9 +2122,8 @@
 }
 
 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
 */
 int sched_init_domains(const struct cpumask *cpu_map)
 {
@@ -2140,16 +2204,16 @@
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
 */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
-	mutex_lock(&sched_domains_mutex);
+	lockdep_assert_held(&sched_domains_mutex);
 
 	/* Always unregister in case we don't destroy any domains: */
 	unregister_sched_domain_sysctl();
@@ -2174,8 +2238,19 @@
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j))
+			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
+				struct root_domain *rd;
+
+				/*
+				 * This domain won't be destroyed and as such
+				 * its dl_bw->total_bw needs to be cleared. It
+				 * will be recomputed in function
+				 * update_tasks_root_domain().
+				 */
+				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+				dl_clear_root_domain(rd);
 				goto match1;
+			}
 		}
 		/* No match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
@@ -2204,10 +2279,10 @@
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
 				has_eas = true;
@@ -2232,6 +2307,15 @@
 	ndoms_cur = ndoms_new;
 
 	register_sched_domain_sysctl();
+}
 
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	mutex_lock(&sched_domains_mutex);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	mutex_unlock(&sched_domains_mutex);
 }
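
Note: after this split, partition_sched_domains() stays a thin wrapper that takes sched_domains_mutex itself, while callers that already hold the mutex (together with the hotplug lock) call partition_sched_domains_locked() directly; the lockdep_assert_held() at the top documents and enforces that contract. A hedged sketch of a caller that manages the locking itself (the function name, my_ndoms and my_doms are invented for illustration):

    /* Sketch only: rebuild domains from a caller that already serializes
     * domain updates on sched_domains_mutex. */
    static void rebuild_my_domains(int my_ndoms, cpumask_var_t my_doms[])
    {
    	cpus_read_lock();			/* hotplug lock, as required */
    	mutex_lock(&sched_domains_mutex);
    	partition_sched_domains_locked(my_ndoms, my_doms, NULL);
    	mutex_unlock(&sched_domains_mutex);
    	cpus_read_unlock();
    }
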