forked from ~ljy/RK356X_SDK_RELEASE

hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/kernel/sched/topology.c
@@ -4,11 +4,16 @@
  */
 #include "sched.h"

+#include <trace/hooks/sched.h>
+
 DEFINE_MUTEX(sched_domains_mutex);
+#ifdef CONFIG_LOCKDEP
+EXPORT_SYMBOL_GPL(sched_domains_mutex);
+#endif

 /* Protected by sched_domains_mutex: */
-cpumask_var_t sched_domains_tmpmask;
-cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_domains_tmpmask;
+static cpumask_var_t sched_domains_tmpmask2;

 #ifdef CONFIG_SCHED_DEBUG

@@ -25,22 +30,22 @@
 	return sched_debug_enabled;
 }

+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				  struct cpumask *groupmask)
 {
 	struct sched_group *group = sd->groups;
+	unsigned long flags = sd->flags;
+	unsigned int idx;

 	cpumask_clear(groupmask);

 	printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
-
-	if (!(sd->flags & SD_LOAD_BALANCE)) {
-		printk("does not load-balance\n");
-		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
-		return -1;
-	}
-
 	printk(KERN_CONT "span=%*pbl level=%s\n",
 	       cpumask_pr_args(sched_domain_span(sd)), sd->name);

@@ -49,6 +54,21 @@
 	}
 	if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
 		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
+	}
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		unsigned int flag = BIT(idx);
+		unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+		if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+		    !(sd->child->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+			       sd_flag_debug[idx].name);
+
+		if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+		    !(sd->parent->flags & flag))
+			printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+			       sd_flag_debug[idx].name);
 	}

 	printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -145,23 +165,22 @@
 }
 #endif /* CONFIG_SCHED_DEBUG */

+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
 static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpumask_weight(sched_domain_span(sd)) == 1)
 		return 1;

 	/* Following flags need at least 2 groups */
-	if (sd->flags & (SD_LOAD_BALANCE |
-			 SD_BALANCE_NEWIDLE |
-			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC |
-			 SD_SHARE_CPUCAPACITY |
-			 SD_ASYM_CPUCAPACITY |
-			 SD_SHARE_PKG_RESOURCES |
-			 SD_SHARE_POWERDOMAIN)) {
-		if (sd->groups != sd->groups->next)
-			return 0;
-	}
+	if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+	    (sd->groups != sd->groups->next))
+		return 0;

 	/* Following flags don't use groups */
 	if (sd->flags & (SD_WAKE_AFFINE))
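The two SD_FLAG() definitions in the hunks above are X-macro passes over <linux/sched/sd_flags.h>: the header is included once to build the sd_flag_debug[] name table and once to fold every flag carrying SDF_NEEDS_GROUPS into SD_DEGENERATE_GROUPS_MASK. A minimal sketch of how the mask-building pass expands; the two entries shown are illustrative, not the full sd_flags.h contents:

/* Hypothetical sd_flags.h excerpt: */
SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)

/* After preprocessing with the mask-building SD_FLAG() definition: */
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
(SD_BALANCE_NEWIDLE * !!((SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) |
(SD_WAKE_AFFINE * !!((SDF_SHARED_CHILD) & SDF_NEEDS_GROUPS)) |
0;

Every flag whose metaflags include SDF_NEEDS_GROUPS is multiplied by 1 and OR-ed in; all other flags contribute 0, so the hand-maintained flag lists removed in this hunk become unnecessary.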
@@ -182,36 +201,24 @@
 		return 0;

 	/* Flags needing groups don't count if only 1 group in parent */
-	if (parent->groups == parent->groups->next) {
-		pflags &= ~(SD_LOAD_BALANCE |
-			    SD_BALANCE_NEWIDLE |
-			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC |
-			    SD_ASYM_CPUCAPACITY |
-			    SD_SHARE_CPUCAPACITY |
-			    SD_SHARE_PKG_RESOURCES |
-			    SD_PREFER_SIBLING |
-			    SD_SHARE_POWERDOMAIN);
-		if (nr_node_ids == 1)
-			pflags &= ~SD_SERIALIZE;
-	}
+	if (parent->groups == parent->groups->next)
+		pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
 	if (~cflags & pflags)
 		return 0;

 	return 1;
 }

-DEFINE_STATIC_KEY_FALSE(sched_energy_present);
-
-#ifdef CONFIG_ENERGY_MODEL
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+DEFINE_STATIC_KEY_FALSE(sched_energy_present);
 unsigned int sysctl_sched_energy_aware = 1;
 DEFINE_MUTEX(sched_energy_mutex);
 bool sched_energy_update;

 #ifdef CONFIG_PROC_SYSCTL
 int sched_energy_aware_handler(struct ctl_table *table, int write,
-			       void __user *buffer, size_t *lenp, loff_t *ppos)
+			       void *buffer, size_t *lenp, loff_t *ppos)
 {
 	int ret, state;

@@ -233,7 +240,6 @@
 	return ret;
 }
 #endif
-#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */

 static void free_pd(struct perf_domain *pd)
 {
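The handler whose tail appears above backs the sched_energy_aware sysctl. Its body is not part of this diff, so the following is only a usage note based on upstream kernels, written as a comment to stay in one language:

/*
 * Runtime toggle served by sched_energy_aware_handler():
 *
 *   /proc/sys/kernel/sched_energy_aware reads back 0 or 1; writing the
 *   other value makes the handler (in upstream kernels) take
 *   sched_energy_mutex, set sched_energy_update and call
 *   rebuild_sched_domains() so the perf domains are torn down or rebuilt.
 */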
@@ -285,10 +291,10 @@
 	printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));

 	while (pd) {
-		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
+		printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }",
 				cpumask_first(perf_domain_span(pd)),
 				cpumask_pr_args(perf_domain_span(pd)),
-				em_pd_nr_cap_states(pd->em_pd));
+				em_pd_nr_perf_states(pd->em_pd));
 		pd = pd->next;
 	}

@@ -320,44 +326,55 @@
  * EAS can be used on a root domain if it meets all the following conditions:
  * 1. an Energy Model (EM) is available;
  * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy.
- * 3. the EM complexity is low enough to keep scheduling overheads low;
+ * 3. no SMT is detected.
+ * 4. the EM complexity is low enough to keep scheduling overheads low;
  *
  * The complexity of the Energy Model is defined as:
  *
- *	C = nr_pd * (nr_cpus + nr_cs)
+ *	C = nr_pd * (nr_cpus + nr_ps)
  *
  * with parameters defined as:
  * - nr_pd:   the number of performance domains
  * - nr_cpus: the number of CPUs
- * - nr_cs:   the sum of the number of capacity states of all performance
+ * - nr_ps:   the sum of the number of performance states of all performance
  *            domains (for example, on a system with 2 performance domains,
- *            with 10 capacity states each, nr_cs = 2 * 10 = 20).
+ *            with 10 performance states each, nr_ps = 2 * 10 = 20).
  *
  * It is generally not a good idea to use such a model in the wake-up path on
  * very complex platforms because of the associated scheduling overheads. The
  * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs
- * with per-CPU DVFS and less than 8 capacity states each, for example.
+ * with per-CPU DVFS and less than 8 performance states each, for example.
  */
 #define EM_MAX_COMPLEXITY 2048

 static bool build_perf_domains(const struct cpumask *cpu_map)
 {
-	int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
+	int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map);
 	struct perf_domain *pd = NULL, *tmp;
 	int cpu = cpumask_first(cpu_map);
 	struct root_domain *rd = cpu_rq(cpu)->rd;
+	bool eas_check = false;

-#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	if (!sysctl_sched_energy_aware)
 		goto free;
-#endif

-	/* EAS is enabled for asymmetric CPU capacity topologies. */
-	if (!per_cpu(sd_asym_cpucapacity, cpu)) {
+	/*
+	 * EAS is enabled for asymmetric CPU capacity topologies.
+	 * Allow vendor to override if desired.
+	 */
+	trace_android_rvh_build_perf_domains(&eas_check);
+	if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) {
 		if (sched_debug()) {
 			pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
 				cpumask_pr_args(cpu_map));
 		}
+		goto free;
+	}
+
+	/* EAS definitely does *not* handle SMT */
+	if (sched_smt_active()) {
+		pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n",
+			cpumask_pr_args(cpu_map));
 		goto free;
 	}

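To make the complexity guard concrete, here is a worked example under assumed numbers (a hypothetical 8-CPU big.LITTLE platform, not taken from the patch):

/*
 * Assume 2 performance domains (one per cluster), 8 CPUs total and
 * 10 performance states per domain:
 *
 *   nr_pd = 2, nr_cpus = 8, nr_ps = 2 * 10 = 20
 *   C = nr_pd * (nr_cpus + nr_ps) = 2 * (8 + 20) = 56
 *
 * 56 <= EM_MAX_COMPLEXITY (2048), so the check further down in
 * build_perf_domains() would let EAS start on such a platform.
 */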
@@ -374,15 +391,15 @@
 		pd = tmp;

 		/*
-		 * Count performance domains and capacity states for the
+		 * Count performance domains and performance states for the
 		 * complexity check.
 		 */
 		nr_pd++;
-		nr_cs += em_pd_nr_cap_states(pd->em_pd);
+		nr_ps += em_pd_nr_perf_states(pd->em_pd);
 	}

 	/* Bail out if the Energy Model complexity is too high. */
-	if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
+	if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) {
 		WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
 		     cpumask_pr_args(cpu_map));
 		goto free;
@@ -409,7 +426,7 @@
 }
 #else
 static void free_pd(struct perf_domain *pd) { }
-#endif /* CONFIG_ENERGY_MODEL */
+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/

 static void free_rootdomain(struct rcu_head *rcu)
 {
@@ -459,7 +476,7 @@
 	raw_spin_unlock_irqrestore(&rq->lock, flags);

 	if (old_rd)
-		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+		call_rcu(&old_rd->rcu, free_rootdomain);
 }

 void sched_get_rd(struct root_domain *rd)
@@ -472,7 +489,7 @@
 	if (!atomic_dec_and_test(&rd->refcount))
 		return;

-	call_rcu_sched(&rd->rcu, free_rootdomain);
+	call_rcu(&rd->rcu, free_rootdomain);
 }

 static int init_rootdomain(struct root_domain *rd)
@@ -490,6 +507,7 @@
 	rd->rto_cpu = -1;
 	raw_spin_lock_init(&rd->rto_lock);
 	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+	atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags);
 #endif

 	init_dl_bw(&rd->dl_bw);
@@ -498,9 +516,6 @@

 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
-
-	init_max_cpu_capacity(&rd->max_cpu_capacity);
-
 	return 0;

 free_cpudl:
@@ -606,13 +621,13 @@
  * the cpumask of the domain), this allows us to quickly tell if
  * two CPUs are in the same cache domain, see cpus_share_cache().
  */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);

 static void update_top_cache_domain(int cpu)
@@ -1050,6 +1065,7 @@
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg;
+	bool already_visited;

 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
@@ -1057,9 +1073,14 @@
 	sg = *per_cpu_ptr(sdd->sg, cpu);
 	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);

-	/* For claim_allocations: */
-	atomic_inc(&sg->ref);
-	atomic_inc(&sg->sgc->ref);
+	/* Increase refcounts for claim_allocations: */
+	already_visited = atomic_inc_return(&sg->ref) > 1;
+	/* sgc visits should follow a similar trend as sg */
+	WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+	/* If we have already visited that group, it's already initialized. */
+	if (already_visited)
+		return sg;

 	if (child) {
 		cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1078,8 +1099,8 @@

 /*
  * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
  *
  * Assumes the sched_domain tree is fully constructed
  */
@@ -1186,16 +1207,13 @@
 	if (!attr || attr->relax_domain_level < 0) {
 		if (default_relax_domain_level < 0)
 			return;
-		else
-			request = default_relax_domain_level;
+		request = default_relax_domain_level;
 	} else
 		request = attr->relax_domain_level;
-	if (request < sd->level) {
+
+	if (sd->level > request) {
 		/* Turn off idle balance on this domain: */
 		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
-	} else {
-		/* Turn on idle balance on this domain: */
-		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
 	}
 }

@@ -1209,13 +1227,13 @@
 	case sa_rootdomain:
 		if (!atomic_read(&d->rd->refcount))
 			free_rootdomain(&d->rd->rcu);
-		/* Fall through */
+		fallthrough;
 	case sa_sd:
 		free_percpu(d->sd);
-		/* Fall through */
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
-		/* Fall through */
+		fallthrough;
 	case sa_none:
 		break;
 	}
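The fallthrough pseudo-keyword replaces the old /* Fall through */ comments so the compiler (and -Wimplicit-fallthrough) can verify the intent. Roughly, it is defined along these lines in the kernel's compiler headers; this is a simplified sketch, not the exact definition:

#if __has_attribute(__fallthrough__)
# define fallthrough __attribute__((__fallthrough__))
#else
# define fallthrough do {} while (0) /* fallthrough */
#endif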
@@ -1269,6 +1287,7 @@
 int sched_max_numa_distance;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
 #endif

 /*
@@ -1281,7 +1300,6 @@
  * SD_SHARE_CPUCAPACITY - describes SMT topologies
  * SD_SHARE_PKG_RESOURCES - describes shared caches
  * SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1292,8 +1310,7 @@
 	(SD_SHARE_CPUCAPACITY | \
 	 SD_SHARE_PKG_RESOURCES | \
 	 SD_NUMA | \
-	 SD_ASYM_PACKING | \
-	 SD_SHARE_POWERDOMAIN)
+	 SD_ASYM_PACKING)

 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1325,18 +1342,12 @@
 	*sd = (struct sched_domain){
 		.min_interval = sd_weight,
 		.max_interval = 2*sd_weight,
-		.busy_factor = 32,
-		.imbalance_pct = 125,
+		.busy_factor = 16,
+		.imbalance_pct = 117,

 		.cache_nice_tries = 0,
-		.busy_idx = 0,
-		.idle_idx = 0,
-		.newidle_idx = 0,
-		.wake_idx = 0,
-		.forkexec_idx = 0,

-		.flags = 1*SD_LOAD_BALANCE
-			| 1*SD_BALANCE_NEWIDLE
+		.flags = 1*SD_BALANCE_NEWIDLE
 			| 1*SD_BALANCE_EXEC
 			| 1*SD_BALANCE_FORK
 			| 0*SD_BALANCE_WAKE
@@ -1351,7 +1362,6 @@

 		.last_balance = jiffies,
 		.balance_interval = sd_weight,
-		.smt_gain = 0,
 		.max_newidle_lb_cost = 0,
 		.next_decay_max_lb_cost = jiffies,
 		.child = child,
@@ -1367,37 +1377,24 @@
 	 * Convert topological properties into behaviour.
 	 */

-	if (sd->flags & SD_ASYM_CPUCAPACITY) {
-		struct sched_domain *t = sd;
-
-		/*
-		 * Don't attempt to spread across CPUs of different capacities.
-		 */
-		if (sd->child)
-			sd->child->flags &= ~SD_PREFER_SIBLING;
-
-		for_each_lower_domain(t)
-			t->flags |= SD_BALANCE_WAKE;
-	}
+	/* Don't attempt to spread across CPUs of different capacities. */
+	if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child)
+		sd->child->flags &= ~SD_PREFER_SIBLING;

 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
 		sd->imbalance_pct = 110;
-		sd->smt_gain = 1178; /* ~15% */

 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;

 #ifdef CONFIG_NUMA
 	} else if (sd->flags & SD_NUMA) {
 		sd->cache_nice_tries = 2;
-		sd->busy_idx = 3;
-		sd->idle_idx = 2;

 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
@@ -1406,8 +1403,6 @@
 #endif
 	} else {
 		sd->cache_nice_tries = 1;
-		sd->busy_idx = 2;
-		sd->idle_idx = 1;
 	}

 	/*
@@ -1548,66 +1543,58 @@
 	}
 }

+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
 void sched_init_numa(void)
 {
-	int next_distance, curr_distance = node_distance(0, 0);
 	struct sched_domain_topology_level *tl;
-	int level = 0;
-	int i, j, k;
-
-	sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
-	if (!sched_domains_numa_distance)
-		return;
-
-	/* Includes NUMA identity node at level 0. */
-	sched_domains_numa_distance[level++] = curr_distance;
-	sched_domains_numa_levels = level;
+	unsigned long *distance_map;
+	int nr_levels = 0;
+	int i, j;

 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
-	 *
-	 * Assumes node_distance(0,j) includes all distances in
-	 * node_distance(i,j) in order to avoid cubic time.
 	 */
-	next_distance = curr_distance;
+	distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+	if (!distance_map)
+		return;
+
+	bitmap_zero(distance_map, NR_DISTANCE_VALUES);
 	for (i = 0; i < nr_node_ids; i++) {
 		for (j = 0; j < nr_node_ids; j++) {
-			for (k = 0; k < nr_node_ids; k++) {
-				int distance = node_distance(i, k);
+			int distance = node_distance(i, j);

-				if (distance > curr_distance &&
-				    (distance < next_distance ||
-				     next_distance == curr_distance))
-					next_distance = distance;
-
-				/*
-				 * While not a strong assumption it would be nice to know
-				 * about cases where if node A is connected to B, B is not
-				 * equally connected to A.
-				 */
-				if (sched_debug() && node_distance(k, i) != distance)
-					sched_numa_warn("Node-distance not symmetric");
-
-				if (sched_debug() && i && !find_numa_distance(distance))
-					sched_numa_warn("Node-0 not representative");
+			if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+				sched_numa_warn("Invalid distance value range");
+				return;
 			}
-			if (next_distance != curr_distance) {
-				sched_domains_numa_distance[level++] = next_distance;
-				sched_domains_numa_levels = level;
-				curr_distance = next_distance;
-			} else break;
-		}

-		/*
-		 * In case of sched_debug() we verify the above assumption.
-		 */
-		if (!sched_debug())
-			break;
+			bitmap_set(distance_map, distance, 1);
+		}
+	}
+	/*
+	 * We can now figure out how many unique distance values there are and
+	 * allocate memory accordingly.
+	 */
+	nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
+
+	sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+	if (!sched_domains_numa_distance) {
+		bitmap_free(distance_map);
+		return;
 	}

+	for (i = 0, j = 0; i < nr_levels; i++, j++) {
+		j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+		sched_domains_numa_distance[i] = j;
+	}
+
+	bitmap_free(distance_map);
+
 	/*
-	 * 'level' contains the number of unique distances
+	 * 'nr_levels' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
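A worked example of the bitmap-based deduplication above, using an assumed two-node distance table (illustrative numbers, not taken from the patch):

/*
 * Hypothetical node_distance() table for 2 nodes:
 *
 *       0   1
 *   0  10  20
 *   1  20  10
 *
 * The i/j loop sets bits 10 and 20 in distance_map, so
 * nr_levels = bitmap_weight(...) = 2 and the find_next_bit() pass
 * fills sched_domains_numa_distance[] = { 10, 20 }.
 */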
@@ -1616,15 +1603,15 @@
 	/*
 	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
 	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
-	 * the array will contain less then 'level' members. This could be
+	 * the array will contain less then 'nr_levels' members. This could be
 	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
 	 * in other functions.
 	 *
-	 * We reset it to 'level' at the end of this function.
+	 * We reset it to 'nr_levels' at the end of this function.
 	 */
 	sched_domains_numa_levels = 0;

-	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
 	if (!sched_domains_numa_masks)
 		return;

@@ -1632,7 +1619,7 @@
 	 * Now for each level, construct a mask per node which contains all
 	 * CPUs of nodes that are that many hops away from us.
 	 */
-	for (i = 0; i < level; i++) {
+	for (i = 0; i < nr_levels; i++) {
 		sched_domains_numa_masks[i] =
 			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
 		if (!sched_domains_numa_masks[i])
@@ -1640,12 +1627,17 @@

 		for (j = 0; j < nr_node_ids; j++) {
 			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			int k;
+
 			if (!mask)
 				return;

 			sched_domains_numa_masks[i][j] = mask;

 			for_each_node(k) {
+				if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+					sched_numa_warn("Node-distance not symmetric");
+
 				if (node_distance(j, k) > sched_domains_numa_distance[i])
 					continue;

@@ -1657,7 +1649,7 @@
 	/* Compute default topology size */
 	for (i = 0; sched_domain_topology[i].mask; i++);

-	tl = kzalloc((i + level + 1) *
+	tl = kzalloc((i + nr_levels + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -1680,7 +1672,7 @@
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 1; j < level; i++, j++) {
+	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
@@ -1692,8 +1684,8 @@

 	sched_domain_topology = tl;

-	sched_domains_numa_levels = level;
-	sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+	sched_domains_numa_levels = nr_levels;
+	sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];

 	init_numa_topology_type();
 }
@@ -1719,6 +1711,26 @@
 		for (j = 0; j < nr_node_ids; j++)
 			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
 	}
+}
+
+/*
+ * sched_numa_find_closest() - given the NUMA topology, find the cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: cpu to be close to
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+	int i, j = cpu_to_node(cpu);
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
+		if (cpu < nr_cpu_ids)
+			return cpu;
+	}
+	return nr_cpu_ids;
 }

 #endif /* CONFIG_NUMA */
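A hedged caller sketch for the new helper: because the per-level masks are ordered from closest to farthest, the first hit is the NUMA-nearest candidate. The wrapper below is a hypothetical in-kernel caller (pick_nearby_cpu() is made up, not part of the patch):

/* Pick a CPU from 'allowed' that is NUMA-closest to 'hint_cpu'. */
static int pick_nearby_cpu(const struct cpumask *allowed, int hint_cpu)
{
	int cpu = sched_numa_find_closest(allowed, hint_cpu);

	/* Fall back to any allowed CPU if the search found nothing. */
	return cpu < nr_cpu_ids ? cpu : cpumask_any(allowed);
}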
@@ -1859,6 +1871,42 @@
 }

 /*
+ * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for
+ * any two given CPUs at this (non-NUMA) topology level.
+ */
+static bool topology_span_sane(struct sched_domain_topology_level *tl,
+			       const struct cpumask *cpu_map, int cpu)
+{
+	int i;
+
+	/* NUMA levels are allowed to overlap */
+	if (tl->flags & SDTL_OVERLAP)
+		return true;
+
+	/*
+	 * Non-NUMA levels cannot partially overlap - they must be either
+	 * completely equal or completely disjoint. Otherwise we can end up
+	 * breaking the sched_group lists - i.e. a later get_group() pass
+	 * breaks the linking done for an earlier span.
+	 */
+	for_each_cpu(i, cpu_map) {
+		if (i == cpu)
+			continue;
+		/*
+		 * We should 'and' all those masks with 'cpu_map' to exactly
+		 * match the topology we're about to build, but that can only
+		 * remove CPUs, which only lessens our ability to detect
+		 * overlaps
+		 */
+		if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
+		    cpumask_intersects(tl->mask(cpu), tl->mask(i)))
+			return false;
+	}
+
+	return true;
+}
+
+/*
  * Find the sched_domain_topology_level where all CPU capacities are visible
  * for all CPUs.
  */
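To illustrate the "completely equal or completely disjoint" rule that topology_span_sane() enforces, here is an assumed bad topology (the masks are made up for illustration):

/*
 * Suppose a (non-NUMA) level reports:
 *
 *   tl->mask(0) = { 0, 1, 2 }
 *   tl->mask(3) = { 2, 3 }
 *
 * The masks are neither equal nor disjoint (CPU2 is in both), so
 * cpumask_intersects() is true while cpumask_equal() is false and the
 * function returns false; build_sched_domains() then WARNs and bails out
 * instead of corrupting the sched_group lists.
 */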
@@ -1871,10 +1919,10 @@
 	unsigned long cap;

 	/* Is there any asymmetry? */
-	cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
+	cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));

 	for_each_cpu(i, cpu_map) {
-		if (arch_scale_cpu_capacity(NULL, i) != cap) {
+		if (arch_scale_cpu_capacity(i) != cap) {
 			asym = true;
 			break;
 		}
@@ -1889,7 +1937,7 @@
 	 * to everyone.
 	 */
 	for_each_cpu(i, cpu_map) {
-		unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
+		unsigned long max_capacity = arch_scale_cpu_capacity(i);
 		int tl_id = 0;

 		for_each_sd_topology(tl) {
@@ -1899,7 +1947,7 @@
 			for_each_cpu_and(j, tl->mask(i), cpu_map) {
 				unsigned long capacity;

-				capacity = arch_scale_cpu_capacity(NULL, j);
+				capacity = arch_scale_cpu_capacity(j);

 				if (capacity <= max_capacity)
 					continue;
@@ -1924,12 +1972,16 @@
 static int
 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
 {
-	enum s_alloc alloc_state;
+	enum s_alloc alloc_state = sa_none;
 	struct sched_domain *sd;
 	struct s_data d;
+	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
+
+	if (WARN_ON(cpumask_empty(cpu_map)))
+		goto error;

 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -1940,15 +1992,17 @@
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
 		struct sched_domain_topology_level *tl;
+		int dflags = 0;

 		sd = NULL;
 		for_each_sd_topology(tl) {
-			int dflags = 0;
-
 			if (tl == tl_asym) {
 				dflags |= SD_ASYM_CPUCAPACITY;
 				has_asym = true;
 			}
+
+			if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
+				goto error;

 			sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);

@@ -1989,13 +2043,25 @@
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
+		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
+
+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
+
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();

 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
+
+	if (rq && sched_debug_enabled) {
+		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
+	}
+	trace_android_vh_build_sched_domains(has_asym);

 	ret = 0;
 error:
@@ -2056,9 +2122,8 @@
 }

 /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups. For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
  */
 int sched_init_domains(const struct cpumask *cpu_map)
 {
@@ -2139,16 +2204,16 @@
 * ndoms_new == 0 is a special case for destroying existing domains,
 * and it will not create the default domain.
 *
- * Call with hotplug lock held
+ * Call with hotplug lock and sched_domains_mutex held
 */
-void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
-			     struct sched_domain_attr *dattr_new)
+void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+				    struct sched_domain_attr *dattr_new)
 {
 	bool __maybe_unused has_eas = false;
 	int i, j, n;
 	int new_topology;

-	mutex_lock(&sched_domains_mutex);
+	lockdep_assert_held(&sched_domains_mutex);

 	/* Always unregister in case we don't destroy any domains: */
 	unregister_sched_domain_sysctl();
@@ -2173,8 +2238,19 @@
 	for (i = 0; i < ndoms_cur; i++) {
 		for (j = 0; j < n && !new_topology; j++) {
 			if (cpumask_equal(doms_cur[i], doms_new[j]) &&
-			    dattrs_equal(dattr_cur, i, dattr_new, j))
+			    dattrs_equal(dattr_cur, i, dattr_new, j)) {
+				struct root_domain *rd;
+
+				/*
+				 * This domain won't be destroyed and as such
+				 * its dl_bw->total_bw needs to be cleared. It
+				 * will be recomputed in function
+				 * update_tasks_root_domain().
+				 */
+				rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
+				dl_clear_root_domain(rd);
 				goto match1;
+			}
 		}
 		/* No match - a current sched domain not in new doms_new[] */
 		detach_destroy_domains(doms_cur[i]);
@@ -2203,10 +2279,10 @@
 		;
 	}

-#ifdef CONFIG_ENERGY_MODEL
+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
 	/* Build perf. domains: */
 	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !sched_energy_update; j++) {
 			if (cpumask_equal(doms_new[i], doms_cur[j]) &&
 			    cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
 				has_eas = true;
@@ -2231,6 +2307,15 @@
 	ndoms_cur = ndoms_new;

 	register_sched_domain_sysctl();
+}

+/*
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	mutex_lock(&sched_domains_mutex);
+	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
 	mutex_unlock(&sched_domains_mutex);
 }
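A hedged usage sketch for the split: a caller that already holds sched_domains_mutex, for example because it has its own bookkeeping that must stay consistent with the domain update, uses the _locked variant, while everyone else keeps calling the wrapper. Illustrative only; rebuild_bookkeeping() is a made-up caller-side step, not part of the patch:

	mutex_lock(&sched_domains_mutex);
	rebuild_bookkeeping();	/* hypothetical caller-side state update */
	partition_sched_domains_locked(ndoms, doms, attrs);
	mutex_unlock(&sched_domains_mutex);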