forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/kernel/sched/topology.c
@@ -4,11 +4,16 @@
  */
 #include "sched.h"
 
+#include <trace/hooks/sched.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif
 
 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;
 
 #ifdef CONFIG_SCHED_DEBUG
 
@@ -25,22 +30,22 @@
 	return sched_debug_enabled;
 }
 
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
+	unsigned long flags = sd->flags;
+	unsigned int idx;
 
 	cpumask_clear(groupmask);
 
 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-		return -1;
-	}
-
 	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
 
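
Note: the hunk above builds the sd_flag_debug[] table with an x-macro: <linux/sched/sd_flags.h> holds one SD_FLAG(name, metaflags) entry per scheduler-domain flag, and each inclusion of that header expands the entries with whatever SD_FLAG definition is in scope at the time. A minimal standalone sketch of the same pattern, with invented flag names and metaflag values rather than the real sd_flags.h contents:

    /* sketch_flags.h -- hypothetical stand-in for <linux/sched/sd_flags.h>,
     * deliberately without include guards so it can be expanded repeatedly. */
    SD_FLAG(SD_EXAMPLE_A, 0x1)
    SD_FLAG(SD_EXAMPLE_B, 0x2)

    /* sketch.c -- expand the same list twice: once into an enum of indices,
     * once into a name/metaflag debug table keyed by those indices. */
    struct flag_debug { unsigned int meta_flags; const char *name; };

    enum {
    #define SD_FLAG(_name, mflags) __##_name,
    #include "sketch_flags.h"
    #undef SD_FLAG
    	__FLAG_CNT,
    };

    static const struct flag_debug flag_debug[] = {
    #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
    #include "sketch_flags.h"
    #undef SD_FLAG
    };
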
@@ -49,6 +54,21 @@
 	}
 	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+	}
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		unsigned int flag = BIT(idx);
+		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+		    !(sd->child->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+			       sd_flag_debug[idx].name);
+
+		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+		    !(sd->parent->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+			       sd_flag_debug[idx].name);
 	}
 
 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -145,23 +165,22 @@
 }
 #endif /* CONFIG_SCHED_DEBUG */
 
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;
 
 	/* Following flags need at least 2 groups */
-	if (sd->flags & (SD_LOAD_BALANCE |
-			 SD_BALANCE_NEWIDLE |
-			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC |
-			 SD_SHARE_CPUCAPACITY |
-			 SD_ASYM_CPUCAPACITY |
-			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
-		if (sd->groups != sd->groups->next)
-			return 0;
-	}
+	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+	    (sd->groups != sd->groups->next))
+		return 0;
 
 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
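
Note: SD_DEGENERATE_GROUPS_MASK is built with the same x-macro trick, except each SD_FLAG(name, mflags) entry now expands to "(name * !!((mflags) & SDF_NEEDS_GROUPS)) |", so a flag contributes its bit only when it carries the SDF_NEEDS_GROUPS metaflag, and the trailing "0;" terminates the OR chain. A compilable illustration of the expanded form, with flag values invented for the example (not the real sd_flags.h):

    #define SDF_NEEDS_GROUPS 0x1
    #define SD_EXAMPLE_A     0x01	/* declared with SDF_NEEDS_GROUPS */
    #define SD_EXAMPLE_B     0x02	/* declared without it */

    static const unsigned int EXAMPLE_MASK =
    	(SD_EXAMPLE_A * !!((SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) |	/* keeps 0x01 */
    	(SD_EXAMPLE_B * !!((0) & SDF_NEEDS_GROUPS)) |			/* contributes 0 */
    0;	/* EXAMPLE_MASK == 0x01 */
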
@@ -182,36 +201,24 @@
 		return 0;
 
 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
-		pflags &= ~(SD_LOAD_BALANCE |
-				SD_BALANCE_NEWIDLE |
-				SD_BALANCE_FORK |
-				SD_BALANCE_EXEC |
-				SD_ASYM_CPUCAPACITY |
-				SD_SHARE_CPUCAPACITY |
-				SD_SHARE_PKG_RESOURCES |
-				SD_PREFER_SIBLING |
-				SD_SHARE_POWERDOMAIN);
-		if (nr_node_ids == 1)
-			pflags &= ~SD_SERIALIZE;
-	}
+	if (parent->groups == parent->groups->next)
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
 	if (~cflags & pflags)
 		return 0;
 
 	return 1;
 }
 
-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-
-#ifdef CONFIG_ENERGY_MODEL
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;
 
 #ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
-			       void __user *buffer, size_t *lenp, loff_t *ppos)
+			       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;
 
@@ -233,7 +240,6 @@
 	return ret;
 }
 #endif
-#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
 static void free_pd(struct perf_domain *pd)
 {
@@ -285,10 +291,10 @@
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
 
 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}
 
@@ -320,44 +326,55 @@
  * EAS can be used on a root domain if it meets all the following conditions:
  *    1. an Energy Model (EM) is available;
  *    2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- *    3. the EM complexity is low enough to keep scheduling overheads low;
+ *    3. no SMT is detected.
+ *    4. the EM complexity is low enough to keep scheduling overheads low;
  *
  * The complexity of the Energy Model is defined as:
 *
- *              C = nr_pd * (nr_cpus + nr_cs)
+ *              C = nr_pd * (nr_cpus + nr_ps)
 *
 * with parameters defined as:
 *  - nr_pd:    the number of performance domains
 *  - nr_cpus:  the number of CPUs
- *  - nr_cs:    the sum of the number of capacity states of all performance
+ *  - nr_ps:    the sum of the number of performance states of all performance
 *              domains (for example, on a system with 2 performance domains,
- *              with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *              with 10 performance states each, nr_ps = 2 * 10 = 20).
 *
 * It is generally not a good idea to use such a model in the wake-up path on
 * very complex platforms because of the associated scheduling overheads. The
 * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
 */
 #define EM_MAX_COMPLEXITY 2048
 
 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	bool eas_check = false;
 
-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	if (!sysctl_sched_energy_aware)
 		goto free;
-#endif
 
-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+	/*
+	 * EAS is enabled for asymmetric CPU capacity topologies.
+	 * Allow vendor to override if desired.
+	 */
+	trace_android_rvh_build_perf_domains(&eas_check);
+	if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
 					cpumask_pr_args(cpu_map));
 		}
+		goto free;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+			cpumask_pr_args(cpu_map));
 		goto free;
 	}
 
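
Note: as a worked example of the complexity check above (numbers invented for illustration): a system with nr_pd = 2 performance domains, nr_cpus = 8 and 10 performance states per domain gives nr_ps = 2 * 10 = 20, so C = nr_pd * (nr_cpus + nr_ps) = 2 * (8 + 20) = 56, comfortably below EM_MAX_COMPLEXITY (2048). A hypothetical 16-CPU platform with per-CPU performance domains of 128 states each would instead give C = 16 * (16 + 2048) = 33024, and build_perf_domains() would bail out with the "EM complexity is too high" warning.
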
@@ -374,15 +391,15 @@
 		pd = tmp;
 
 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}
 
 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 					cpumask_pr_args(cpu_map));
 		goto free;
@@ -409,7 +426,7 @@
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/
 
 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -459,7 +476,7 @@
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+		call_rcu(&old_rd->rcu, free_rootdomain);
 }
 
 void sched_get_rd(struct root_domain *rd)
@@ -472,7 +489,7 @@
 	if (!atomic_dec_and_test(&rd->refcount))
 		return;
 
-	call_rcu_sched(&rd->rcu, free_rootdomain);
+	call_rcu(&rd->rcu, free_rootdomain);
 }
 
 static int init_rootdomain(struct root_domain *rd)
@@ -490,7 +507,7 @@
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-	rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ;
+	atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags);
 #endif
 
 	init_dl_bw(&rd->dl_bw);
@@ -499,9 +516,6 @@
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
-
-	init_max_cpu_capacity(&rd->max_cpu_capacity);
-
 	return 0;
 
 free_cpudl:
@@ -607,13 +621,13 @@
 * the cpumask of the domain), this allows us to quickly tell if
 * two CPUs are in the same cache domain, see cpus_share_cache().
 */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 
 static void update_top_cache_domain(int cpu)
@@ -1051,6 +1065,7 @@
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg;
+	bool already_visited;
 
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
@@ -1058,9 +1073,14 @@
 	sg = *per_cpu_ptr(sdd->sg, cpu);
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
 
-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;
 
 	if (child) {
 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
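
Note: the get_group() change relies on atomic_inc_return() returning the post-increment value, so the first visitor observes 1 and every later visitor observes something greater than 1, which is what already_visited captures. A standalone sketch of that "first visitor does the one-time init" idiom in plain C11 atomics (names invented, not the kernel API):

    #include <stdatomic.h>
    #include <stdbool.h>

    struct group { atomic_int ref; int data; };

    /* Take a reference on @g; returns true if some earlier caller already
     * initialized it, false if this caller just performed the one-time init. */
    static bool get_group_ref(struct group *g)
    {
    	bool already_visited = atomic_fetch_add(&g->ref, 1) > 0;

    	if (!already_visited)
    		g->data = 42;	/* one-time initialization by the first visitor */

    	return already_visited;
    }
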
@@ -1079,8 +1099,8 @@
 
 /*
 * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
 *
 * Assumes the sched_domain tree is fully constructed
 */
@@ -1187,16 +1207,13 @@
 	if (!attr || attr->relax_domain_level < 0) {
 		if (default_relax_domain_level < 0)
 			return;
-		else
-			request = default_relax_domain_level;
+		request = default_relax_domain_level;
 	} else
 		request = attr->relax_domain_level;
-	if (request < sd->level) {
+
+	if (sd->level > request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* Turn on idle balance on this domain: */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }
 
@@ -1210,13 +1227,13 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu);
-		/* Fall through */
+		fallthrough;
 	case sa_sd:
 		free_percpu(d->sd);
-		/* Fall through */
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
-		/* Fall through */
+		fallthrough;
 	case sa_none:
 		break;
 	}
@@ -1270,6 +1287,7 @@
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1282,7 +1300,6 @@
 *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
 *   SD_SHARE_PKG_RESOURCES - describes shared caches
 *   SD_NUMA                - describes NUMA topologies
- *   SD_SHARE_POWERDOMAIN   - describes shared power domain
 *
 * Odd one out, which beside describing the topology has a quirk also
 * prescribes the desired behaviour that goes along with it:
@@ -1293,8 +1310,7 @@
 	(SD_SHARE_CPUCAPACITY	|	\
 	 SD_SHARE_PKG_RESOURCES |	\
 	 SD_NUMA		|	\
-	 SD_ASYM_PACKING	|	\
-	 SD_SHARE_POWERDOMAIN)
+	 SD_ASYM_PACKING)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1326,18 +1342,12 @@
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
-		.busy_factor		= 32,
-		.imbalance_pct		= 125,
+		.busy_factor		= 16,
+		.imbalance_pct		= 117,
 
 		.cache_nice_tries	= 0,
-		.busy_idx		= 0,
-		.idle_idx		= 0,
-		.newidle_idx		= 0,
-		.wake_idx		= 0,
-		.forkexec_idx		= 0,
 
-		.flags			= 1*SD_LOAD_BALANCE
-					| 1*SD_BALANCE_NEWIDLE
+		.flags			= 1*SD_BALANCE_NEWIDLE
 					| 1*SD_BALANCE_EXEC
 					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
@@ -1352,7 +1362,6 @@
 
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
-		.smt_gain		= 0,
 		.max_newidle_lb_cost	= 0,
 		.next_decay_max_lb_cost	= jiffies,
 		.child			= child,
@@ -1368,37 +1377,24 @@
 	 * Convert topological properties into behaviour.
 	 */
 
-	if (sd->flags & SD_ASYM_CPUCAPACITY) {
-		struct sched_domain *t = sd;
-
-		/*
-		 * Don't attempt to spread across CPUs of different capacities.
-		 */
-		if (sd->child)
-			sd->child->flags &= ~SD_PREFER_SIBLING;
-
-		for_each_lower_domain(t)
-			t->flags |= SD_BALANCE_WAKE;
-	}
+	/* Don't attempt to spread across CPUs of different capacities. */
+	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+		sd->child->flags &= ~SD_PREFER_SIBLING;
 
 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
 
 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;
 
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
@@ -1407,8 +1403,6 @@
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}
 
 	/*
@@ -1549,66 +1543,58 @@
 	}
 }
 
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;
 
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
+			int distance = node_distance(i, j);
 
-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}
 
-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+			bitmap_set(distance_map, distance, 1);
+		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}
 
+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
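
Note: the new sched_init_numa() replaces the old deduplicating selection sort with a set: every node_distance(i, j) value sets one bit in a bitmap, bitmap_weight() then gives the number of unique distances, and find_next_bit() walks them in ascending order to fill sched_domains_numa_distance[]. A small userspace sketch of the same deduplication idea (plain C with an invented 3-node distance table, not the kernel bitmap API):

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_DISTANCE_VALUES 256		/* distances assumed to fit in 8 bits */

    int main(void)
    {
    	/* Hypothetical stand-in for node_distance(i, j). */
    	int node_distance[3][3] = {
    		{ 10, 20, 30 },
    		{ 20, 10, 30 },
    		{ 30, 30, 10 },
    	};
    	bool seen[NR_DISTANCE_VALUES] = { false };
    	int levels[NR_DISTANCE_VALUES];
    	int nr_levels = 0;

    	/* Mark every distance that occurs anywhere in the table... */
    	for (int i = 0; i < 3; i++)
    		for (int j = 0; j < 3; j++)
    			seen[node_distance[i][j]] = true;

    	/* ...then walk the set in ascending order, one level per unique value. */
    	for (int d = 0; d < NR_DISTANCE_VALUES; d++)
    		if (seen[d])
    			levels[nr_levels++] = d;

    	for (int i = 0; i < nr_levels; i++)
    		printf("level %d: distance %d\n", i, levels[i]);	/* 10, 20, 30 */

    	return 0;
    }
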
@@ -1617,15 +1603,15 @@
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;
 
-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;
 
@@ -1633,7 +1619,7 @@
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1641,12 +1627,17 @@
 
 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;
 
 			sched_domains_numa_masks[i][j] = mask;
 
 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;
 
@@ -1658,7 +1649,7 @@
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);
 
-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1681,7 +1672,7 @@
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1693,8 +1684,8 @@
 
 	sched_domain_topology = tl;
 
-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
 	init_numa_topology_type();
 }
@@ -1720,6 +1711,26 @@
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+	int i, j = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	return nr_cpu_ids;
 }
 
 #endif /* CONFIG_NUMA */
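
Note: sched_numa_find_closest() walks sched_domains_numa_masks[] from the smallest distance outwards, so the first level whose mask intersects the candidate set yields the closest permitted CPU, and nr_cpu_ids signals "nothing found" just like the cpumask iterators do. A hedged usage sketch (the wrapper, 'allowed' and 'home_cpu' are invented for illustration):

    /* Sketch: pick a CPU from 'allowed' that is NUMA-close to 'home_cpu'. */
    static int pick_close_cpu(const struct cpumask *allowed, int home_cpu)
    {
    	int target = sched_numa_find_closest(allowed, home_cpu);

    	if (target >= nr_cpu_ids)
    		target = cpumask_any(allowed);	/* no close CPU: take any allowed one */

    	return target;
    }
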
@@ -1860,6 +1871,42 @@
 }
 
 /*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+			      const struct cpumask *cpu_map, int cpu)
+{
+	int i;
+
+	/* NUMA levels are allowed to overlap */
+	if (tl->flags & SDTL_OVERLAP)
+		return true;
+
+	/*
+	 * Non-NUMA levels cannot partially overlap - they must be either
+	 * completely equal or completely disjoint. Otherwise we can end up
+	 * breaking the sched_group lists - i.e. a later get_group() pass
+	 * breaks the linking done for an earlier span.
+	 */
+	for_each_cpu(i, cpu_map) {
+		if (i == cpu)
+			continue;
+		/*
+		 * We should 'and' all those masks with 'cpu_map' to exactly
+		 * match the topology we're about to build, but that can only
+		 * remove CPUs, which only lessens our ability to detect
+		 * overlaps
+		 */
+		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
 * Find the sched_domain_topology_level where all CPU capacities are visible
 * for all CPUs.
 */
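
Note: as a concrete illustration of the new sanity check (CPU numbers invented): a cluster level reporting tl->mask(0) = {0,1} and tl->mask(1) = {0,1} passes (equal), {0,1} against {2,3} passes (disjoint), but a firmware or devicetree bug reporting tl->mask(0) = {0,1} while tl->mask(1) = {1,2} intersects without being equal, so build_sched_domains() below now warns and bails out instead of silently corrupting the sched_group lists.
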
@@ -1872,10 +1919,10 @@
 	unsigned long cap;
 
 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
 
 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1890,7 +1937,7 @@
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;
 
 		for_each_sd_topology(tl) {
@@ -1900,7 +1947,7 @@
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;
 
-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);
 
 				if (capacity <= max_capacity)
 					continue;
@@ -1925,12 +1972,16 @@
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -1941,15 +1992,17 @@
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;
 			}
+
+			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+				goto error;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
 
@@ -1990,13 +2043,25 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+	if (rq && sched_debug_enabled) {
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+	trace_android_vh_build_sched_domains(has_asym);
 
 	ret = 0;
 error:
@@ -2057,9 +2122,8 @@
 }
 
 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
 */
 int sched_init_domains(const struct cpumask *cpu_map)
 {
@@ -2140,16 +2204,16 @@
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
 */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;
 
-	mutex_lock(&sched_domains_mutex);
+	lockdep_assert_held(&sched_domains_mutex);
 
 	/* Always unregister in case we don't destroy any domains: */
 	unregister_sched_domain_sysctl();
@@ -2174,8 +2238,19 @@
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j))
+			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
+				struct root_domain *rd;
+
+				/*
+				 * This domain won't be destroyed and as such
+				 * its dl_bw->total_bw needs to be cleared. It
+				 * will be recomputed in function
+				 * update_tasks_root_domain().
+				 */
+				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+				dl_clear_root_domain(rd);
 				goto match1;
+			}
 		}
 		/* No match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
@@ -2204,10 +2279,10 @@
 		;
 	}
 
-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
 				has_eas = true;
@@ -2232,6 +2307,15 @@
 	ndoms_cur = ndoms_new;
 
 	register_sched_domain_sysctl();
+}
 
+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	mutex_lock(&sched_domains_mutex);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	mutex_unlock(&sched_domains_mutex);
 }
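
Note: after this split, partition_sched_domains() stays a thin wrapper that takes sched_domains_mutex itself, while callers that already hold the mutex (together with the hotplug lock) call partition_sched_domains_locked() directly; the lockdep_assert_held() at the top documents and enforces that contract. A hedged sketch of a caller that manages the locking itself (the function name, my_ndoms and my_doms are invented for illustration):

    /* Sketch only: rebuild domains from a caller that already serializes
     * domain updates on sched_domains_mutex. */
    static void rebuild_my_domains(int my_ndoms, cpumask_var_t my_doms[])
    {
    	cpus_read_lock();			/* hotplug lock, as required */
    	mutex_lock(&sched_domains_mutex);
    	partition_sched_domains_locked(my_ndoms, my_doms, NULL);
    	mutex_unlock(&sched_domains_mutex);
    	cpus_read_unlock();
    }
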