2023-12-11 6778948f9de86c3cfaf36725a7c87dcff9ba247f
kernel/kernel/sched/topology.c
@@ -4,11 +4,16 @@
  */
 #include "sched.h"
 
+#include <trace/hooks/sched.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -25,22 +30,22 @@
 	return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
+	unsigned long flags = sd->flags;
+	unsigned int idx;
 
 	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-		return -1;
-	}
-
 	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
@@ -49,6 +54,21 @@
 	}
 	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+	}
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		unsigned int flag = BIT(idx);
+		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+		    !(sd->child->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+			       sd_flag_debug[idx].name);
+
+		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+		    !(sd->parent->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+			       sd_flag_debug[idx].name);
 	}
 
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -145,23 +165,22 @@
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
-	if (sd->flags & (SD_LOAD_BALANCE |
-			 SD_BALANCE_NEWIDLE |
-			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC |
-			 SD_SHARE_CPUCAPACITY |
-			 SD_ASYM_CPUCAPACITY |
-			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
-		if (sd->groups != sd->groups->next)
-			return 0;
-	}
+	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+	    (sd->groups != sd->groups->next))
+		return 0;
 
 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
@@ -182,36 +201,24 @@
 		return 0;
 
 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
-		pflags &= ~(SD_LOAD_BALANCE |
-			    SD_BALANCE_NEWIDLE |
-			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC |
-			    SD_ASYM_CPUCAPACITY |
-			    SD_SHARE_CPUCAPACITY |
-			    SD_SHARE_PKG_RESOURCES |
-			    SD_PREFER_SIBLING |
-			    SD_SHARE_POWERDOMAIN);
-		if (nr_node_ids == 1)
-			pflags &= ~SD_SERIALIZE;
-	}
+	if (parent->groups == parent->groups->next)
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
 	if (~cflags & pflags)
 		return 0;
 
 	return 1;
 }
 
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-
-#ifdef CONFIG_ENERGY_MODEL
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
 
 #ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp, loff_t *ppos)
+		void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
 
@@ -233,7 +240,6 @@
 	return ret;
 }
 #endif
-#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 static void free_pd(struct perf_domain *pd)
 {
@@ -285,10 +291,10 @@
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
 
 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}
 
@@ -320,44 +326,55 @@
  * EAS can be used on a root domain if it meets all the following conditions:
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    3. no SMT is detected.
+ *    4. the EM complexity is low enough to keep scheduling overheads low;
  *
  * The complexity of the Energy Model is defined as:
  *
- *              C = nr_pd * (nr_cpus + nr_cs)
+ *              C = nr_pd * (nr_cpus + nr_ps)
  *
  * with parameters defined as:
  *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
- *  - nr_cs:    the sum of the number of capacity states of all performance
+ *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
- *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *              with 10 performance states each, nr_ps = 2 * 10 = 20).
  *
  * It is generally not a good idea to use such a model in the wake-up path on
  * very complex platforms because of the associated scheduling overheads. The
  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
  */
 #define EM_MAX_COMPLEXITY 2048
 
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	bool eas_check = false;
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	if (!sysctl_sched_energy_aware)
 		goto free;
-#endif
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+	/*
+	 * EAS is enabled for asymmetric CPU capacity topologies.
+	 * Allow vendor to override if desired.
+	 */
+	trace_android_rvh_build_perf_domains(&eas_check);
+	if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
 					cpumask_pr_args(cpu_map));
 		}
+		goto free;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+			cpumask_pr_args(cpu_map));
 		goto free;
 	}
 
@@ -374,15 +391,15 @@
 		pd = tmp;
 
 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}
 
 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 				cpumask_pr_args(cpu_map));
 		goto free;
@@ -409,7 +426,7 @@
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -459,7 +476,7 @@
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+		call_rcu(&old_rd->rcu, free_rootdomain);
 }
 
 void sched_get_rd(struct root_domain *rd)
@@ -472,7 +489,7 @@
 	if (!atomic_dec_and_test(&rd->refcount))
 		return;
 
-	call_rcu_sched(&rd->rcu, free_rootdomain);
+	call_rcu(&rd->rcu, free_rootdomain);
 }
 
 static int init_rootdomain(struct root_domain *rd)
@@ -490,7 +507,6 @@
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-	rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
 #endif
 
 	init_dl_bw(&rd->dl_bw);
@@ -499,9 +515,6 @@
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
-
-	init_max_cpu_capacity(&rd->max_cpu_capacity);
-
 	return 0;
 
 free_cpudl:
@@ -607,13 +620,13 @@
  * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
  */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -1051,6 +1064,7 @@
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg;
+	bool already_visited;
 
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
@@ -1058,9 +1072,14 @@
 	sg = *per_cpu_ptr(sdd->sg, cpu);
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 
-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;
 
 	if (child) {
 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1079,8 +1098,8 @@
 
 /*
  * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
  *
  * Assumes the sched_domain tree is fully constructed
  */
@@ -1187,16 +1206,13 @@
 	if (!attr || attr->relax_domain_level < 0) {
 		if (default_relax_domain_level < 0)
 			return;
-		else
-			request = default_relax_domain_level;
+		request = default_relax_domain_level;
 	} else
 		request = attr->relax_domain_level;
-	if (request < sd->level) {
+
+	if (sd->level > request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* Turn on idle balance on this domain: */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
@@ -1210,13 +1226,13 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu);
-		/* Fall through */
+		fallthrough;
 	case sa_sd:
 		free_percpu(d->sd);
-		/* Fall through */
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
-		/* Fall through */
+		fallthrough;
 	case sa_none:
 		break;
 	}
@@ -1270,6 +1286,7 @@
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1282,7 +1299,6 @@
  * SD_SHARE_CPUCAPACITY   - describes SMT topologies
 * SD_SHARE_PKG_RESOURCES - describes shared caches
 * SD_NUMA                - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN   - describes shared power domain
  *
  * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1293,8 +1309,7 @@
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
-	 SD_ASYM_PACKING	|	\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1326,18 +1341,12 @@
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
-		.busy_factor		= 32,
-		.imbalance_pct		= 125,
+		.busy_factor		= 16,
+		.imbalance_pct		= 117,
 
 		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
 
-		.flags			= 1*SD_LOAD_BALANCE
-					| 1*SD_BALANCE_NEWIDLE
+		.flags			= 1*SD_BALANCE_NEWIDLE
 					| 1*SD_BALANCE_EXEC
 					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
@@ -1352,7 +1361,6 @@
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 		.child			= child,
@@ -1368,37 +1376,24 @@
 	 * Convert topological properties into behaviour.
 	 */
 
-	if (sd->flags & SD_ASYM_CPUCAPACITY) {
-		struct sched_domain *t = sd;
-
-		/*
-		 * Don't attempt to spread across CPUs of different capacities.
-		 */
-		if (sd->child)
-			sd->child->flags &= ~SD_PREFER_SIBLING;
-
-		for_each_lower_domain(t)
-			t->flags |= SD_BALANCE_WAKE;
-	}
+	/* Don't attempt to spread across CPUs of different capacities. */
+	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+		sd->child->flags &= ~SD_PREFER_SIBLING;
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
 
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
 
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
@@ -1407,8 +1402,6 @@
 		}
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}
 
@@ -1549,66 +1542,58 @@
 	}
 }
 
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
+			int distance = node_distance(i, j);
 
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}
 
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+			bitmap_set(distance_map, distance, 1);
+		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}
 
+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1617,15 +1602,15 @@
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 
@@ -1633,7 +1618,7 @@
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1641,12 +1626,17 @@
 
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;
 
 			sched_domains_numa_masks[i][j] = mask;
 
 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 
@@ -1658,7 +1648,7 @@
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1681,7 +1671,7 @@
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1693,8 +1683,8 @@
 
 	sched_domain_topology = tl;
 
-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
 	init_numa_topology_type();
 }
@@ -1720,6 +1710,26 @@
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ *                             closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+	int i, j = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	return nr_cpu_ids;
 }
 
 #endif /* CONFIG_NUMA */
@@ -1860,6 +1870,42 @@
 }
 
 /*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+			      const struct cpumask *cpu_map, int cpu)
+{
+	int i;
+
+	/* NUMA levels are allowed to overlap */
+	if (tl->flags & SDTL_OVERLAP)
+		return true;
+
+	/*
+	 * Non-NUMA levels cannot partially overlap - they must be either
+	 * completely equal or completely disjoint. Otherwise we can end up
+	 * breaking the sched_group lists - i.e. a later get_group() pass
+	 * breaks the linking done for an earlier span.
+	 */
+	for_each_cpu(i, cpu_map) {
+		if (i == cpu)
+			continue;
+		/*
+		 * We should 'and' all those masks with 'cpu_map' to exactly
+		 * match the topology we're about to build, but that can only
+		 * remove CPUs, which only lessens our ability to detect
+		 * overlaps
+		 */
+		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
  * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs.
  */
@@ -1872,10 +1918,10 @@
 	unsigned long cap;
 
 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
 
 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1890,7 +1936,7 @@
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;
 
 		for_each_sd_topology(tl) {
@@ -1900,7 +1946,7 @@
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;
 
-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);
 
 				if (capacity <= max_capacity)
 					continue;
@@ -1925,12 +1971,16 @@
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -1941,15 +1991,17 @@
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;
 			}
+
+			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+				goto error;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
 
@@ -1990,13 +2042,25 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+	if (rq && sched_debug_enabled) {
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+	trace_android_vh_build_sched_domains(has_asym);
 
 	ret = 0;
 error:
@@ -2057,9 +2121,8 @@
 }
 
 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
  */
 int sched_init_domains(const struct cpumask *cpu_map)
 {
@@ -2140,16 +2203,16 @@
  * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
  *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
  */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
-	mutex_lock(&sched_domains_mutex);
+	lockdep_assert_held(&sched_domains_mutex);
 
 	/* Always unregister in case we don't destroy any domains: */
 	unregister_sched_domain_sysctl();
@@ -2174,8 +2237,19 @@
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j))
+			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
+				struct root_domain *rd;
+
+				/*
+				 * This domain won't be destroyed and as such
+				 * its dl_bw->total_bw needs to be cleared. It
+				 * will be recomputed in function
+				 * update_tasks_root_domain().
+				 */
+				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+				dl_clear_root_domain(rd);
 				goto match1;
+			}
 		}
 		/* No match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
@@ -2204,10 +2278,10 @@
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
 				has_eas = true;
@@ -2232,6 +2306,15 @@
 	ndoms_cur = ndoms_new;
 
 	register_sched_domain_sysctl();
+}
 
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	mutex_lock(&sched_domains_mutex);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	mutex_unlock(&sched_domains_mutex);
 }
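
Note on the SD_FLAG() blocks added above: they re-include <linux/sched/sd_flags.h> with different SD_FLAG() definitions, so one flag list produces both the sd_flag_debug[] name table and the SD_DEGENERATE_GROUPS_MASK constant. Below is a minimal userspace sketch of the same include-twice (x-macro) idea; the EXAMPLE_SD_FLAGS list, the two flag names and the metaflag values are made up for illustration and are not the kernel's definitions.

#include <stdio.h>

#define SDF_SHARED_CHILD  0x1
#define SDF_NEEDS_GROUPS  0x2

/* One flag list, expanded several different ways below. */
#define EXAMPLE_SD_FLAGS(X)						\
	X(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)	\
	X(SD_WAKE_AFFINE,     SDF_SHARED_CHILD)

/* Expansion 1: bit indices, then the flag values themselves. */
#define X(name, mflags) __##name,
enum { EXAMPLE_SD_FLAGS(X) __SD_FLAG_CNT };
#undef X

#define X(name, mflags) name = 1 << __##name,
enum { EXAMPLE_SD_FLAGS(X) };
#undef X

/* Expansion 2: a debug table of names and metaflags, indexed by bit. */
struct sd_flag_debug { unsigned int meta_flags; const char *name; };

#define X(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
static const struct sd_flag_debug sd_flag_debug[] = { EXAMPLE_SD_FLAGS(X) };
#undef X

/* Expansion 3: a mask of every flag that needs more than one group. */
#define X(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
static const unsigned int SD_DEGENERATE_GROUPS_MASK = EXAMPLE_SD_FLAGS(X) 0;
#undef X

int main(void)
{
	for (unsigned int i = 0; i < __SD_FLAG_CNT; i++)
		printf("%-20s meta=%#x\n", sd_flag_debug[i].name,
		       sd_flag_debug[i].meta_flags);
	printf("degenerate-groups mask = %#x\n", SD_DEGENERATE_GROUPS_MASK);
	return 0;
}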
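
The rewritten sched_init_numa() above replaces the old selection-sort deduplication with a bitmap of observed distance values, then walks the set bits in ascending order to build the sorted array of unique distances. A small userspace sketch of that step follows, using a made-up 4-node distance table and a plain bool array in place of the kernel bitmap API.

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES            4
#define LOCAL_DISTANCE      10
#define NR_DISTANCE_VALUES  256          /* kernel: 1 << DISTANCE_BITS */

/* Illustrative SLIT-style table; row i, column j = distance(i, j). */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 30, 30 },
	{ 20, 10, 30, 30 },
	{ 30, 30, 10, 20 },
	{ 30, 30, 20, 10 },
};

int main(void)
{
	bool distance_map[NR_DISTANCE_VALUES] = { false };
	int distances[NR_DISTANCE_VALUES];
	int nr_levels = 0;

	/* O(nr_nodes^2) pass: record every distance value that occurs. */
	for (int i = 0; i < NR_NODES; i++) {
		for (int j = 0; j < NR_NODES; j++) {
			int d = dist[i][j];

			if (d < LOCAL_DISTANCE || d >= NR_DISTANCE_VALUES) {
				fprintf(stderr, "invalid distance %d\n", d);
				return 1;
			}
			distance_map[d] = true;
		}
	}

	/* Walk set bits in ascending order, as find_next_bit() would. */
	for (int d = 0; d < NR_DISTANCE_VALUES; d++)
		if (distance_map[d])
			distances[nr_levels++] = d;

	printf("nr_levels = %d:", nr_levels);	/* expect 3: 10 20 30 */
	for (int i = 0; i < nr_levels; i++)
		printf(" %d", distances[i]);
	printf("\n");
	return 0;
}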
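
The new trace_android_rvh_build_perf_domains() call gives a vendor module a chance to force EAS on even when sd_asym_cpucapacity is not set. A sketch of how such a module might hook it is below; it assumes the hook is declared in include/trace/hooks/sched.h with the usual DECLARE_RESTRICTED_HOOK(android_rvh_build_perf_domains, TP_PROTO(bool *eas_check), ...) form, so check the actual prototype there. Restricted vendor hooks can only be registered once and cannot be unregistered, so this belongs in a vendor module that is never unloaded.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical vendor module; hook prototype assumed from the call site above. */
#include <linux/module.h>
#include <trace/hooks/sched.h>

static void vendor_build_perf_domains(void *unused, bool *eas_check)
{
	/* Let build_perf_domains() proceed even on a symmetric topology. */
	*eas_check = true;
}

static int __init vendor_eas_init(void)
{
	return register_trace_android_rvh_build_perf_domains(
					vendor_build_perf_domains, NULL);
}
module_init(vendor_eas_init);

MODULE_LICENSE("GPL");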