| .. | .. |
|---|
| 4 | 4 | */ |
|---|
| 5 | 5 | #include "sched.h" |
|---|
| 6 | 6 | |
|---|
| 7 | +#include <trace/hooks/sched.h> |
|---|
| 8 | + |
|---|
| 7 | 9 | DEFINE_MUTEX(sched_domains_mutex); |
|---|
| 10 | +#ifdef CONFIG_LOCKDEP |
|---|
| 11 | +EXPORT_SYMBOL_GPL(sched_domains_mutex); |
|---|
| 12 | +#endif |
|---|
| 8 | 13 | |
|---|
| 9 | 14 | /* Protected by sched_domains_mutex: */ |
|---|
| 10 | | -cpumask_var_t sched_domains_tmpmask; |
|---|
| 11 | | -cpumask_var_t sched_domains_tmpmask2; |
|---|
| 15 | +static cpumask_var_t sched_domains_tmpmask; |
|---|
| 16 | +static cpumask_var_t sched_domains_tmpmask2; |
|---|
| 12 | 17 | |
|---|
| 13 | 18 | #ifdef CONFIG_SCHED_DEBUG |
|---|
| 14 | 19 | |
|---|
| .. | .. |
|---|
| 25 | 30 | return sched_debug_enabled; |
|---|
| 26 | 31 | } |
|---|
| 27 | 32 | |
|---|
| 33 | +#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, |
|---|
| 34 | +const struct sd_flag_debug sd_flag_debug[] = { |
|---|
| 35 | +#include <linux/sched/sd_flags.h> |
|---|
| 36 | +}; |
|---|
| 37 | +#undef SD_FLAG |
|---|
| 38 | + |
|---|
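For readers unfamiliar with this X-macro pattern: each `SD_FLAG(name, metaflags)` line in `<linux/sched/sd_flags.h>` expands into one designated initializer of the debug table, indexed by the flag's `__SD_*` enum value. A rough sketch of what the preprocessor produces is shown below; the concrete entry and its metaflag are assumptions for illustration, not quoted from `sd_flags.h`.

```c
/*
 * Illustrative expansion only (the real table is generated from every
 * SD_FLAG() declaration in <linux/sched/sd_flags.h>; the metaflag shown
 * for SD_BALANCE_EXEC is an assumption for this example).
 */
const struct sd_flag_debug sd_flag_debug[] = {
	[__SD_BALANCE_EXEC] = {
		.meta_flags = SDF_SHARED_CHILD,
		.name       = "SD_BALANCE_EXEC",
	},
	/* ... one such entry per SD_FLAG() declaration ... */
};
```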
| 28 | 39 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
|---|
| 29 | 40 | struct cpumask *groupmask) |
|---|
| 30 | 41 | { |
|---|
| 31 | 42 | struct sched_group *group = sd->groups; |
|---|
| 43 | + unsigned long flags = sd->flags; |
|---|
| 44 | + unsigned int idx; |
|---|
| 32 | 45 | |
|---|
| 33 | 46 | cpumask_clear(groupmask); |
|---|
| 34 | 47 | |
|---|
| 35 | 48 | printk(KERN_DEBUG "%*s domain-%d: ", level, "", level); |
|---|
| 36 | | - |
|---|
| 37 | | - if (!(sd->flags & SD_LOAD_BALANCE)) { |
|---|
| 38 | | - printk("does not load-balance\n"); |
|---|
| 39 | | - if (sd->parent) |
|---|
| 40 | | - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
|---|
| 41 | | - return -1; |
|---|
| 42 | | - } |
|---|
| 43 | | - |
|---|
| 44 | 49 | printk(KERN_CONT "span=%*pbl level=%s\n", |
|---|
| 45 | 50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
|---|
| 46 | 51 | |
|---|
| .. | .. |
|---|
| 49 | 54 | } |
|---|
| 50 | 55 | if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) { |
|---|
| 51 | 56 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
|---|
| 57 | + } |
|---|
| 58 | + |
|---|
| 59 | + for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { |
|---|
| 60 | + unsigned int flag = BIT(idx); |
|---|
| 61 | + unsigned int meta_flags = sd_flag_debug[idx].meta_flags; |
|---|
| 62 | + |
|---|
| 63 | + if ((meta_flags & SDF_SHARED_CHILD) && sd->child && |
|---|
| 64 | + !(sd->child->flags & flag)) |
|---|
| 65 | + printk(KERN_ERR "ERROR: flag %s set here but not in child\n", |
|---|
| 66 | + sd_flag_debug[idx].name); |
|---|
| 67 | + |
|---|
| 68 | + if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && |
|---|
| 69 | + !(sd->parent->flags & flag)) |
|---|
| 70 | + printk(KERN_ERR "ERROR: flag %s set here but not in parent\n", |
|---|
| 71 | + sd_flag_debug[idx].name); |
|---|
| 52 | 72 | } |
|---|
| 53 | 73 | |
|---|
| 54 | 74 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
|---|
| .. | .. |
|---|
| 145 | 165 | } |
|---|
| 146 | 166 | #endif /* CONFIG_SCHED_DEBUG */ |
|---|
| 147 | 167 | |
|---|
| 168 | +/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ |
|---|
| 169 | +#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | |
|---|
| 170 | +static const unsigned int SD_DEGENERATE_GROUPS_MASK = |
|---|
| 171 | +#include <linux/sched/sd_flags.h> |
|---|
| 172 | +0; |
|---|
| 173 | +#undef SD_FLAG |
|---|
| 174 | + |
|---|
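Note how this expansion works: every `SD_FLAG(name, mflags)` line becomes `(name * !!((mflags) & SDF_NEEDS_GROUPS)) |`, i.e. the flag's value when `SDF_NEEDS_GROUPS` is among its metaflags and `0` otherwise. Each term ends in a bitwise OR, which is why the macro body above ends with `|` and the whole chain is closed by the literal `0;` after the include. A hand-expanded sketch, with the two metaflag sets assumed purely for illustration:

```c
/*
 * Hand-expanded illustration (metaflags below are assumptions for the
 * example, not quoted from sd_flags.h):
 *
 *   SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 *   SD_FLAG(SD_WAKE_AFFINE,     SDF_SHARED_CHILD)
 */
static const unsigned int SD_DEGENERATE_GROUPS_MASK =
	(SD_BALANCE_NEWIDLE * !!((SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) & SDF_NEEDS_GROUPS)) | /* kept      */
	(SD_WAKE_AFFINE     * !!((SDF_SHARED_CHILD)                    & SDF_NEEDS_GROUPS)) | /* becomes 0 */
	0;
```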
| 148 | 175 | static int sd_degenerate(struct sched_domain *sd) |
|---|
| 149 | 176 | { |
|---|
| 150 | 177 | if (cpumask_weight(sched_domain_span(sd)) == 1) |
|---|
| 151 | 178 | return 1; |
|---|
| 152 | 179 | |
|---|
| 153 | 180 | /* Following flags need at least 2 groups */ |
|---|
| 154 | | - if (sd->flags & (SD_LOAD_BALANCE | |
|---|
| 155 | | - SD_BALANCE_NEWIDLE | |
|---|
| 156 | | - SD_BALANCE_FORK | |
|---|
| 157 | | - SD_BALANCE_EXEC | |
|---|
| 158 | | - SD_SHARE_CPUCAPACITY | |
|---|
| 159 | | - SD_ASYM_CPUCAPACITY | |
|---|
| 160 | | - SD_SHARE_PKG_RESOURCES | |
|---|
| 161 | | - SD_SHARE_POWERDOMAIN)) { |
|---|
| 162 | | - if (sd->groups != sd->groups->next) |
|---|
| 163 | | - return 0; |
|---|
| 164 | | - } |
|---|
| 181 | + if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && |
|---|
| 182 | + (sd->groups != sd->groups->next)) |
|---|
| 183 | + return 0; |
|---|
| 165 | 184 | |
|---|
| 166 | 185 | /* Following flags don't use groups */ |
|---|
| 167 | 186 | if (sd->flags & (SD_WAKE_AFFINE)) |
|---|
| .. | .. |
|---|
| 182 | 201 | return 0; |
|---|
| 183 | 202 | |
|---|
| 184 | 203 | /* Flags needing groups don't count if only 1 group in parent */ |
|---|
| 185 | | - if (parent->groups == parent->groups->next) { |
|---|
| 186 | | - pflags &= ~(SD_LOAD_BALANCE | |
|---|
| 187 | | - SD_BALANCE_NEWIDLE | |
|---|
| 188 | | - SD_BALANCE_FORK | |
|---|
| 189 | | - SD_BALANCE_EXEC | |
|---|
| 190 | | - SD_ASYM_CPUCAPACITY | |
|---|
| 191 | | - SD_SHARE_CPUCAPACITY | |
|---|
| 192 | | - SD_SHARE_PKG_RESOURCES | |
|---|
| 193 | | - SD_PREFER_SIBLING | |
|---|
| 194 | | - SD_SHARE_POWERDOMAIN); |
|---|
| 195 | | - if (nr_node_ids == 1) |
|---|
| 196 | | - pflags &= ~SD_SERIALIZE; |
|---|
| 197 | | - } |
|---|
| 204 | + if (parent->groups == parent->groups->next) |
|---|
| 205 | + pflags &= ~SD_DEGENERATE_GROUPS_MASK; |
|---|
| 206 | + |
|---|
| 198 | 207 | if (~cflags & pflags) |
|---|
| 199 | 208 | return 0; |
|---|
| 200 | 209 | |
|---|
| 201 | 210 | return 1; |
|---|
| 202 | 211 | } |
|---|
| 203 | 212 | |
|---|
| 204 | | -DEFINE_STATIC_KEY_FALSE(sched_energy_present); |
|---|
| 205 | | - |
|---|
| 206 | | -#ifdef CONFIG_ENERGY_MODEL |
|---|
| 207 | 213 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
|---|
| 214 | +DEFINE_STATIC_KEY_FALSE(sched_energy_present); |
|---|
| 208 | 215 | unsigned int sysctl_sched_energy_aware = 1; |
|---|
| 209 | 216 | DEFINE_MUTEX(sched_energy_mutex); |
|---|
| 210 | 217 | bool sched_energy_update; |
|---|
| 211 | 218 | |
|---|
| 212 | 219 | #ifdef CONFIG_PROC_SYSCTL |
|---|
| 213 | 220 | int sched_energy_aware_handler(struct ctl_table *table, int write, |
|---|
| 214 | | - void __user *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 221 | + void *buffer, size_t *lenp, loff_t *ppos) |
|---|
| 215 | 222 | { |
|---|
| 216 | 223 | int ret, state; |
|---|
| 217 | 224 | |
|---|
| .. | .. |
|---|
| 233 | 240 | return ret; |
|---|
| 234 | 241 | } |
|---|
| 235 | 242 | #endif |
|---|
| 236 | | -#endif /* defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ |
|---|
| 237 | 243 | |
|---|
| 238 | 244 | static void free_pd(struct perf_domain *pd) |
|---|
| 239 | 245 | { |
|---|
| .. | .. |
|---|
| 285 | 291 | printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map)); |
|---|
| 286 | 292 | |
|---|
| 287 | 293 | while (pd) { |
|---|
| 288 | | - printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }", |
|---|
| 294 | + printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }", |
|---|
| 289 | 295 | cpumask_first(perf_domain_span(pd)), |
|---|
| 290 | 296 | cpumask_pr_args(perf_domain_span(pd)), |
|---|
| 291 | | - em_pd_nr_cap_states(pd->em_pd)); |
|---|
| 297 | + em_pd_nr_perf_states(pd->em_pd)); |
|---|
| 292 | 298 | pd = pd->next; |
|---|
| 293 | 299 | } |
|---|
| 294 | 300 | |
|---|
| .. | .. |
|---|
| 320 | 326 | * EAS can be used on a root domain if it meets all the following conditions: |
|---|
| 321 | 327 | * 1. an Energy Model (EM) is available; |
|---|
| 322 | 328 | * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. |
|---|
| 323 | | - * 3. the EM complexity is low enough to keep scheduling overheads low; |
|---|
| 329 | + * 3. no SMT is detected. |
|---|
| 330 | + * 4. the EM complexity is low enough to keep scheduling overheads low; |
|---|
| 324 | 331 | * |
|---|
| 325 | 332 | * The complexity of the Energy Model is defined as: |
|---|
| 326 | 333 | * |
|---|
| 327 | | - * C = nr_pd * (nr_cpus + nr_cs) |
|---|
| 334 | + * C = nr_pd * (nr_cpus + nr_ps) |
|---|
| 328 | 335 | * |
|---|
| 329 | 336 | * with parameters defined as: |
|---|
| 330 | 337 | * - nr_pd: the number of performance domains |
|---|
| 331 | 338 | * - nr_cpus: the number of CPUs |
|---|
| 332 | | - * - nr_cs: the sum of the number of capacity states of all performance |
|---|
| 339 | + * - nr_ps: the sum of the number of performance states of all performance |
|---|
| 333 | 340 | * domains (for example, on a system with 2 performance domains, |
|---|
| 334 | | - * with 10 capacity states each, nr_cs = 2 * 10 = 20). |
|---|
| 341 | + * with 10 performance states each, nr_ps = 2 * 10 = 20). |
|---|
| 335 | 342 | * |
|---|
| 336 | 343 | * It is generally not a good idea to use such a model in the wake-up path on |
|---|
| 337 | 344 | * very complex platforms because of the associated scheduling overheads. The |
|---|
| 338 | 345 | * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs |
|---|
| 339 | | - * with per-CPU DVFS and less than 8 capacity states each, for example. |
|---|
| 346 | + * with per-CPU DVFS and less than 8 performance states each, for example. |
|---|
| 340 | 347 | */ |
|---|
| 341 | 348 | #define EM_MAX_COMPLEXITY 2048 |
|---|
| 342 | 349 | |
|---|
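As a worked example of the bound above (numbers chosen to match the "16 CPUs with per-CPU DVFS and less than 8 performance states" case in the comment, not taken from any real platform): 16 performance domains, 16 CPUs and 7 performance states per domain give C = 16 * (16 + 16*7) = 16 * 128 = 2048, which does not exceed EM_MAX_COMPLEXITY, so EAS would still be permitted. A standalone sketch of the arithmetic:

```c
#include <stdio.h>

#define EM_MAX_COMPLEXITY 2048

int main(void)
{
	/* Hypothetical platform: per-CPU DVFS on 16 CPUs, 7 OPPs each. */
	int nr_pd = 16;			/* one perf domain per CPU      */
	int nr_cpus = 16;
	int nr_ps = 16 * 7;		/* 112 performance states total */
	int C = nr_pd * (nr_cpus + nr_ps);

	/* Prints "C = 2048, EAS allowed: yes" */
	printf("C = %d, EAS allowed: %s\n", C,
	       C > EM_MAX_COMPLEXITY ? "no" : "yes");
	return 0;
}
```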
| 343 | 350 | static bool build_perf_domains(const struct cpumask *cpu_map) |
|---|
| 344 | 351 | { |
|---|
| 345 | | - int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map); |
|---|
| 352 | + int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); |
|---|
| 346 | 353 | struct perf_domain *pd = NULL, *tmp; |
|---|
| 347 | 354 | int cpu = cpumask_first(cpu_map); |
|---|
| 348 | 355 | struct root_domain *rd = cpu_rq(cpu)->rd; |
|---|
| 356 | + bool eas_check = false; |
|---|
| 349 | 357 | |
|---|
| 350 | | -#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
|---|
| 351 | 358 | if (!sysctl_sched_energy_aware) |
|---|
| 352 | 359 | goto free; |
|---|
| 353 | | -#endif |
|---|
| 354 | 360 | |
|---|
| 355 | | - /* EAS is enabled for asymmetric CPU capacity topologies. */ |
|---|
| 356 | | - if (!per_cpu(sd_asym_cpucapacity, cpu)) { |
|---|
| 361 | + /* |
|---|
| 362 | + * EAS is enabled for asymmetric CPU capacity topologies. |
|---|
| 363 | + * Allow vendor to override if desired. |
|---|
| 364 | + */ |
|---|
| 365 | + trace_android_rvh_build_perf_domains(&eas_check); |
|---|
| 366 | + if (!per_cpu(sd_asym_cpucapacity, cpu) && !eas_check) { |
|---|
| 357 | 367 | if (sched_debug()) { |
|---|
| 358 | 368 | pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", |
|---|
| 359 | 369 | cpumask_pr_args(cpu_map)); |
|---|
| 360 | 370 | } |
|---|
| 371 | + goto free; |
|---|
| 372 | + } |
|---|
| 373 | + |
|---|
| 374 | + /* EAS definitely does *not* handle SMT */ |
|---|
| 375 | + if (sched_smt_active()) { |
|---|
| 376 | + pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", |
|---|
| 377 | + cpumask_pr_args(cpu_map)); |
|---|
| 361 | 378 | goto free; |
|---|
| 362 | 379 | } |
|---|
| 363 | 380 | |
|---|
| .. | .. |
|---|
| 374 | 391 | pd = tmp; |
|---|
| 375 | 392 | |
|---|
| 376 | 393 | /* |
|---|
| 377 | | - * Count performance domains and capacity states for the |
|---|
| 394 | + * Count performance domains and performance states for the |
|---|
| 378 | 395 | * complexity check. |
|---|
| 379 | 396 | */ |
|---|
| 380 | 397 | nr_pd++; |
|---|
| 381 | | - nr_cs += em_pd_nr_cap_states(pd->em_pd); |
|---|
| 398 | + nr_ps += em_pd_nr_perf_states(pd->em_pd); |
|---|
| 382 | 399 | } |
|---|
| 383 | 400 | |
|---|
| 384 | 401 | /* Bail out if the Energy Model complexity is too high. */ |
|---|
| 385 | | - if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) { |
|---|
| 402 | + if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { |
|---|
| 386 | 403 | WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", |
|---|
| 387 | 404 | cpumask_pr_args(cpu_map)); |
|---|
| 388 | 405 | goto free; |
|---|
| .. | .. |
|---|
| 409 | 426 | } |
|---|
| 410 | 427 | #else |
|---|
| 411 | 428 | static void free_pd(struct perf_domain *pd) { } |
|---|
| 412 | | -#endif /* CONFIG_ENERGY_MODEL */ |
|---|
| 429 | +#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
|---|
| 413 | 430 | |
|---|
| 414 | 431 | static void free_rootdomain(struct rcu_head *rcu) |
|---|
| 415 | 432 | { |
|---|
| .. | .. |
|---|
| 459 | 476 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
|---|
| 460 | 477 | |
|---|
| 461 | 478 | if (old_rd) |
|---|
| 462 | | - call_rcu_sched(&old_rd->rcu, free_rootdomain); |
|---|
| 479 | + call_rcu(&old_rd->rcu, free_rootdomain); |
|---|
| 463 | 480 | } |
|---|
| 464 | 481 | |
|---|
| 465 | 482 | void sched_get_rd(struct root_domain *rd) |
|---|
| .. | .. |
|---|
| 472 | 489 | if (!atomic_dec_and_test(&rd->refcount)) |
|---|
| 473 | 490 | return; |
|---|
| 474 | 491 | |
|---|
| 475 | | - call_rcu_sched(&rd->rcu, free_rootdomain); |
|---|
| 492 | + call_rcu(&rd->rcu, free_rootdomain); |
|---|
| 476 | 493 | } |
|---|
| 477 | 494 | |
|---|
| 478 | 495 | static int init_rootdomain(struct root_domain *rd) |
|---|
| .. | .. |
|---|
| 490 | 507 | rd->rto_cpu = -1; |
|---|
| 491 | 508 | raw_spin_lock_init(&rd->rto_lock); |
|---|
| 492 | 509 | init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); |
|---|
| 493 | | - rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ; |
|---|
| 510 | + atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.flags); |
|---|
| 494 | 511 | #endif |
|---|
| 495 | 512 | |
|---|
| 496 | 513 | init_dl_bw(&rd->dl_bw); |
|---|
| .. | .. |
|---|
| 499 | 516 | |
|---|
| 500 | 517 | if (cpupri_init(&rd->cpupri) != 0) |
|---|
| 501 | 518 | goto free_cpudl; |
|---|
| 502 | | - |
|---|
| 503 | | - init_max_cpu_capacity(&rd->max_cpu_capacity); |
|---|
| 504 | | - |
|---|
| 505 | 519 | return 0; |
|---|
| 506 | 520 | |
|---|
| 507 | 521 | free_cpudl: |
|---|
| .. | .. |
|---|
| 607 | 621 | * the cpumask of the domain), this allows us to quickly tell if |
|---|
| 608 | 622 | * two CPUs are in the same cache domain, see cpus_share_cache(). |
|---|
| 609 | 623 | */ |
|---|
| 610 | | -DEFINE_PER_CPU(struct sched_domain *, sd_llc); |
|---|
| 624 | +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); |
|---|
| 611 | 625 | DEFINE_PER_CPU(int, sd_llc_size); |
|---|
| 612 | 626 | DEFINE_PER_CPU(int, sd_llc_id); |
|---|
| 613 | | -DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); |
|---|
| 614 | | -DEFINE_PER_CPU(struct sched_domain *, sd_numa); |
|---|
| 615 | | -DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing); |
|---|
| 616 | | -DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); |
|---|
| 627 | +DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); |
|---|
| 628 | +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); |
|---|
| 629 | +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); |
|---|
| 630 | +DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); |
|---|
| 617 | 631 | DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); |
|---|
| 618 | 632 | |
|---|
| 619 | 633 | static void update_top_cache_domain(int cpu) |
|---|
| .. | .. |
|---|
| 1051 | 1065 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
|---|
| 1052 | 1066 | struct sched_domain *child = sd->child; |
|---|
| 1053 | 1067 | struct sched_group *sg; |
|---|
| 1068 | + bool already_visited; |
|---|
| 1054 | 1069 | |
|---|
| 1055 | 1070 | if (child) |
|---|
| 1056 | 1071 | cpu = cpumask_first(sched_domain_span(child)); |
|---|
| .. | .. |
|---|
| 1058 | 1073 | sg = *per_cpu_ptr(sdd->sg, cpu); |
|---|
| 1059 | 1074 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
|---|
| 1060 | 1075 | |
|---|
| 1061 | | - /* For claim_allocations: */ |
|---|
| 1062 | | - atomic_inc(&sg->ref); |
|---|
| 1063 | | - atomic_inc(&sg->sgc->ref); |
|---|
| 1076 | + /* Increase refcounts for claim_allocations: */ |
|---|
| 1077 | + already_visited = atomic_inc_return(&sg->ref) > 1; |
|---|
| 1078 | + /* sgc visits should follow a similar trend as sg */ |
|---|
| 1079 | + WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); |
|---|
| 1080 | + |
|---|
| 1081 | + /* If we have already visited that group, it's already initialized. */ |
|---|
| 1082 | + if (already_visited) |
|---|
| 1083 | + return sg; |
|---|
| 1064 | 1084 | |
|---|
| 1065 | 1085 | if (child) { |
|---|
| 1066 | 1086 | cpumask_copy(sched_group_span(sg), sched_domain_span(child)); |
|---|
| .. | .. |
|---|
| 1079 | 1099 | |
|---|
| 1080 | 1100 | /* |
|---|
| 1081 | 1101 | * build_sched_groups will build a circular linked list of the groups |
|---|
| 1082 | | - * covered by the given span, and will set each group's ->cpumask correctly, |
|---|
| 1083 | | - * and ->cpu_capacity to 0. |
|---|
| 1102 | + * covered by the given span, will set each group's ->cpumask correctly, |
|---|
| 1103 | + * and will initialize their ->sgc. |
|---|
| 1084 | 1104 | * |
|---|
| 1085 | 1105 | * Assumes the sched_domain tree is fully constructed |
|---|
| 1086 | 1106 | */ |
|---|
| .. | .. |
|---|
| 1187 | 1207 | if (!attr || attr->relax_domain_level < 0) { |
|---|
| 1188 | 1208 | if (default_relax_domain_level < 0) |
|---|
| 1189 | 1209 | return; |
|---|
| 1190 | | - else |
|---|
| 1191 | | - request = default_relax_domain_level; |
|---|
| 1210 | + request = default_relax_domain_level; |
|---|
| 1192 | 1211 | } else |
|---|
| 1193 | 1212 | request = attr->relax_domain_level; |
|---|
| 1194 | | - if (request < sd->level) { |
|---|
| 1213 | + |
|---|
| 1214 | + if (sd->level > request) { |
|---|
| 1195 | 1215 | /* Turn off idle balance on this domain: */ |
|---|
| 1196 | 1216 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
|---|
| 1197 | | - } else { |
|---|
| 1198 | | - /* Turn on idle balance on this domain: */ |
|---|
| 1199 | | - sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
|---|
| 1200 | 1217 | } |
|---|
| 1201 | 1218 | } |
|---|
| 1202 | 1219 | |
|---|
| .. | .. |
|---|
| 1210 | 1227 | case sa_rootdomain: |
|---|
| 1211 | 1228 | if (!atomic_read(&d->rd->refcount)) |
|---|
| 1212 | 1229 | free_rootdomain(&d->rd->rcu); |
|---|
| 1213 | | - /* Fall through */ |
|---|
| 1230 | + fallthrough; |
|---|
| 1214 | 1231 | case sa_sd: |
|---|
| 1215 | 1232 | free_percpu(d->sd); |
|---|
| 1216 | | - /* Fall through */ |
|---|
| 1233 | + fallthrough; |
|---|
| 1217 | 1234 | case sa_sd_storage: |
|---|
| 1218 | 1235 | __sdt_free(cpu_map); |
|---|
| 1219 | | - /* Fall through */ |
|---|
| 1236 | + fallthrough; |
|---|
| 1220 | 1237 | case sa_none: |
|---|
| 1221 | 1238 | break; |
|---|
| 1222 | 1239 | } |
|---|
| .. | .. |
|---|
| 1270 | 1287 | int sched_max_numa_distance; |
|---|
| 1271 | 1288 | static int *sched_domains_numa_distance; |
|---|
| 1272 | 1289 | static struct cpumask ***sched_domains_numa_masks; |
|---|
| 1290 | +int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; |
|---|
| 1273 | 1291 | #endif |
|---|
| 1274 | 1292 | |
|---|
| 1275 | 1293 | /* |
|---|
| .. | .. |
|---|
| 1282 | 1300 | * SD_SHARE_CPUCAPACITY - describes SMT topologies |
|---|
| 1283 | 1301 | * SD_SHARE_PKG_RESOURCES - describes shared caches |
|---|
| 1284 | 1302 | * SD_NUMA - describes NUMA topologies |
|---|
| 1285 | | - * SD_SHARE_POWERDOMAIN - describes shared power domain |
|---|
| 1286 | 1303 | * |
|---|
| 1287 | 1304 | * Odd one out, which beside describing the topology has a quirk also |
|---|
| 1288 | 1305 | * prescribes the desired behaviour that goes along with it: |
|---|
| .. | .. |
|---|
| 1293 | 1310 | (SD_SHARE_CPUCAPACITY | \ |
|---|
| 1294 | 1311 | SD_SHARE_PKG_RESOURCES | \ |
|---|
| 1295 | 1312 | SD_NUMA | \ |
|---|
| 1296 | | - SD_ASYM_PACKING | \ |
|---|
| 1297 | | - SD_SHARE_POWERDOMAIN) |
|---|
| 1313 | + SD_ASYM_PACKING) |
|---|
| 1298 | 1314 | |
|---|
| 1299 | 1315 | static struct sched_domain * |
|---|
| 1300 | 1316 | sd_init(struct sched_domain_topology_level *tl, |
|---|
| .. | .. |
|---|
| 1326 | 1342 | *sd = (struct sched_domain){ |
|---|
| 1327 | 1343 | .min_interval = sd_weight, |
|---|
| 1328 | 1344 | .max_interval = 2*sd_weight, |
|---|
| 1329 | | - .busy_factor = 32, |
|---|
| 1330 | | - .imbalance_pct = 125, |
|---|
| 1345 | + .busy_factor = 16, |
|---|
| 1346 | + .imbalance_pct = 117, |
|---|
| 1331 | 1347 | |
|---|
| 1332 | 1348 | .cache_nice_tries = 0, |
|---|
| 1333 | | - .busy_idx = 0, |
|---|
| 1334 | | - .idle_idx = 0, |
|---|
| 1335 | | - .newidle_idx = 0, |
|---|
| 1336 | | - .wake_idx = 0, |
|---|
| 1337 | | - .forkexec_idx = 0, |
|---|
| 1338 | 1349 | |
|---|
| 1339 | | - .flags = 1*SD_LOAD_BALANCE |
|---|
| 1340 | | - | 1*SD_BALANCE_NEWIDLE |
|---|
| 1350 | + .flags = 1*SD_BALANCE_NEWIDLE |
|---|
| 1341 | 1351 | | 1*SD_BALANCE_EXEC |
|---|
| 1342 | 1352 | | 1*SD_BALANCE_FORK |
|---|
| 1343 | 1353 | | 0*SD_BALANCE_WAKE |
|---|
| .. | .. |
|---|
| 1352 | 1362 | |
|---|
| 1353 | 1363 | .last_balance = jiffies, |
|---|
| 1354 | 1364 | .balance_interval = sd_weight, |
|---|
| 1355 | | - .smt_gain = 0, |
|---|
| 1356 | 1365 | .max_newidle_lb_cost = 0, |
|---|
| 1357 | 1366 | .next_decay_max_lb_cost = jiffies, |
|---|
| 1358 | 1367 | .child = child, |
|---|
| .. | .. |
|---|
| 1368 | 1377 | * Convert topological properties into behaviour. |
|---|
| 1369 | 1378 | */ |
|---|
| 1370 | 1379 | |
|---|
| 1371 | | - if (sd->flags & SD_ASYM_CPUCAPACITY) { |
|---|
| 1372 | | - struct sched_domain *t = sd; |
|---|
| 1373 | | - |
|---|
| 1374 | | - /* |
|---|
| 1375 | | - * Don't attempt to spread across CPUs of different capacities. |
|---|
| 1376 | | - */ |
|---|
| 1377 | | - if (sd->child) |
|---|
| 1378 | | - sd->child->flags &= ~SD_PREFER_SIBLING; |
|---|
| 1379 | | - |
|---|
| 1380 | | - for_each_lower_domain(t) |
|---|
| 1381 | | - t->flags |= SD_BALANCE_WAKE; |
|---|
| 1382 | | - } |
|---|
| 1380 | + /* Don't attempt to spread across CPUs of different capacities. */ |
|---|
| 1381 | + if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) |
|---|
| 1382 | + sd->child->flags &= ~SD_PREFER_SIBLING; |
|---|
| 1383 | 1383 | |
|---|
| 1384 | 1384 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
|---|
| 1385 | 1385 | sd->imbalance_pct = 110; |
|---|
| 1386 | | - sd->smt_gain = 1178; /* ~15% */ |
|---|
| 1387 | 1386 | |
|---|
| 1388 | 1387 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
|---|
| 1389 | 1388 | sd->imbalance_pct = 117; |
|---|
| 1390 | 1389 | sd->cache_nice_tries = 1; |
|---|
| 1391 | | - sd->busy_idx = 2; |
|---|
| 1392 | 1390 | |
|---|
| 1393 | 1391 | #ifdef CONFIG_NUMA |
|---|
| 1394 | 1392 | } else if (sd->flags & SD_NUMA) { |
|---|
| 1395 | 1393 | sd->cache_nice_tries = 2; |
|---|
| 1396 | | - sd->busy_idx = 3; |
|---|
| 1397 | | - sd->idle_idx = 2; |
|---|
| 1398 | 1394 | |
|---|
| 1399 | 1395 | sd->flags &= ~SD_PREFER_SIBLING; |
|---|
| 1400 | 1396 | sd->flags |= SD_SERIALIZE; |
|---|
| 1401 | | - if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { |
|---|
| 1397 | + if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { |
|---|
| 1402 | 1398 | sd->flags &= ~(SD_BALANCE_EXEC | |
|---|
| 1403 | 1399 | SD_BALANCE_FORK | |
|---|
| 1404 | 1400 | SD_WAKE_AFFINE); |
|---|
| .. | .. |
|---|
| 1407 | 1403 | #endif |
|---|
| 1408 | 1404 | } else { |
|---|
| 1409 | 1405 | sd->cache_nice_tries = 1; |
|---|
| 1410 | | - sd->busy_idx = 2; |
|---|
| 1411 | | - sd->idle_idx = 1; |
|---|
| 1412 | 1406 | } |
|---|
| 1413 | 1407 | |
|---|
| 1414 | 1408 | /* |
|---|
| .. | .. |
|---|
| 1549 | 1543 | } |
|---|
| 1550 | 1544 | } |
|---|
| 1551 | 1545 | |
|---|
| 1546 | + |
|---|
| 1547 | +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) |
|---|
| 1548 | + |
|---|
| 1552 | 1549 | void sched_init_numa(void) |
|---|
| 1553 | 1550 | { |
|---|
| 1554 | | - int next_distance, curr_distance = node_distance(0, 0); |
|---|
| 1555 | 1551 | struct sched_domain_topology_level *tl; |
|---|
| 1556 | | - int level = 0; |
|---|
| 1557 | | - int i, j, k; |
|---|
| 1558 | | - |
|---|
| 1559 | | - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); |
|---|
| 1560 | | - if (!sched_domains_numa_distance) |
|---|
| 1561 | | - return; |
|---|
| 1562 | | - |
|---|
| 1563 | | - /* Includes NUMA identity node at level 0. */ |
|---|
| 1564 | | - sched_domains_numa_distance[level++] = curr_distance; |
|---|
| 1565 | | - sched_domains_numa_levels = level; |
|---|
| 1552 | + unsigned long *distance_map; |
|---|
| 1553 | + int nr_levels = 0; |
|---|
| 1554 | + int i, j; |
|---|
| 1566 | 1555 | |
|---|
| 1567 | 1556 | /* |
|---|
| 1568 | 1557 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the |
|---|
| 1569 | 1558 | * unique distances in the node_distance() table. |
|---|
| 1570 | | - * |
|---|
| 1571 | | - * Assumes node_distance(0,j) includes all distances in |
|---|
| 1572 | | - * node_distance(i,j) in order to avoid cubic time. |
|---|
| 1573 | 1559 | */ |
|---|
| 1574 | | - next_distance = curr_distance; |
|---|
| 1560 | + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); |
|---|
| 1561 | + if (!distance_map) |
|---|
| 1562 | + return; |
|---|
| 1563 | + |
|---|
| 1564 | + bitmap_zero(distance_map, NR_DISTANCE_VALUES); |
|---|
| 1575 | 1565 | for (i = 0; i < nr_node_ids; i++) { |
|---|
| 1576 | 1566 | for (j = 0; j < nr_node_ids; j++) { |
|---|
| 1577 | | - for (k = 0; k < nr_node_ids; k++) { |
|---|
| 1578 | | - int distance = node_distance(i, k); |
|---|
| 1567 | + int distance = node_distance(i, j); |
|---|
| 1579 | 1568 | |
|---|
| 1580 | | - if (distance > curr_distance && |
|---|
| 1581 | | - (distance < next_distance || |
|---|
| 1582 | | - next_distance == curr_distance)) |
|---|
| 1583 | | - next_distance = distance; |
|---|
| 1584 | | - |
|---|
| 1585 | | - /* |
|---|
| 1586 | | - * While not a strong assumption it would be nice to know |
|---|
| 1587 | | - * about cases where if node A is connected to B, B is not |
|---|
| 1588 | | - * equally connected to A. |
|---|
| 1589 | | - */ |
|---|
| 1590 | | - if (sched_debug() && node_distance(k, i) != distance) |
|---|
| 1591 | | - sched_numa_warn("Node-distance not symmetric"); |
|---|
| 1592 | | - |
|---|
| 1593 | | - if (sched_debug() && i && !find_numa_distance(distance)) |
|---|
| 1594 | | - sched_numa_warn("Node-0 not representative"); |
|---|
| 1569 | + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { |
|---|
| 1570 | + sched_numa_warn("Invalid distance value range"); |
|---|
| 1571 | + return; |
|---|
| 1595 | 1572 | } |
|---|
| 1596 | | - if (next_distance != curr_distance) { |
|---|
| 1597 | | - sched_domains_numa_distance[level++] = next_distance; |
|---|
| 1598 | | - sched_domains_numa_levels = level; |
|---|
| 1599 | | - curr_distance = next_distance; |
|---|
| 1600 | | - } else break; |
|---|
| 1601 | | - } |
|---|
| 1602 | 1573 | |
|---|
| 1603 | | - /* |
|---|
| 1604 | | - * In case of sched_debug() we verify the above assumption. |
|---|
| 1605 | | - */ |
|---|
| 1606 | | - if (!sched_debug()) |
|---|
| 1607 | | - break; |
|---|
| 1574 | + bitmap_set(distance_map, distance, 1); |
|---|
| 1575 | + } |
|---|
| 1576 | + } |
|---|
| 1577 | + /* |
|---|
| 1578 | + * We can now figure out how many unique distance values there are and |
|---|
| 1579 | + * allocate memory accordingly. |
|---|
| 1580 | + */ |
|---|
| 1581 | + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); |
|---|
| 1582 | + |
|---|
| 1583 | + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); |
|---|
| 1584 | + if (!sched_domains_numa_distance) { |
|---|
| 1585 | + bitmap_free(distance_map); |
|---|
| 1586 | + return; |
|---|
| 1608 | 1587 | } |
|---|
| 1609 | 1588 | |
|---|
| 1589 | + for (i = 0, j = 0; i < nr_levels; i++, j++) { |
|---|
| 1590 | + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); |
|---|
| 1591 | + sched_domains_numa_distance[i] = j; |
|---|
| 1592 | + } |
|---|
| 1593 | + |
|---|
| 1594 | + bitmap_free(distance_map); |
|---|
| 1595 | + |
|---|
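The idea of the rewrite above is to replace the old deduplicating selection sort (which relied on node 0's distances being representative of the whole table) with a presence bitmap: every distance found in the node_distance() table sets one bit, and the set bits, read in increasing order, are exactly the unique distance levels. A tiny userspace sketch of that idea, with a made-up 4-node distance table and a plain bool array standing in for the kernel bitmap API:

```c
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical symmetric 4-node distance table. */
	int dist[4][4] = {
		{ 10, 20, 30, 30 },
		{ 20, 10, 30, 30 },
		{ 30, 30, 10, 20 },
		{ 30, 30, 20, 10 },
	};
	bool seen[256] = { false };
	int i, j, nr_levels = 0;

	/* Mark every distance value that occurs anywhere in the table. */
	for (i = 0; i < 4; i++)
		for (j = 0; j < 4; j++)
			seen[dist[i][j]] = true;

	/* Walk the set bits in increasing order: 10, 20, 30 -> 3 levels. */
	for (i = 0; i < 256; i++)
		if (seen[i])
			printf("level %d: distance %d\n", nr_levels++, i);

	return 0;
}
```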
| 1610 | 1596 | /* |
|---|
| 1611 | | - * 'level' contains the number of unique distances |
|---|
| 1597 | + * 'nr_levels' contains the number of unique distances |
|---|
| 1612 | 1598 | * |
|---|
| 1613 | 1599 | * The sched_domains_numa_distance[] array includes the actual distance |
|---|
| 1614 | 1600 | * numbers. |
|---|
| .. | .. |
|---|
| 1617 | 1603 | /* |
|---|
| 1618 | 1604 | * Here, we should temporarily reset sched_domains_numa_levels to 0. |
|---|
| 1619 | 1605 | * If it fails to allocate memory for array sched_domains_numa_masks[][], |
|---|
| 1620 | | - * the array will contain less then 'level' members. This could be |
|---|
| 1606 | + * the array will contain less than 'nr_levels' members. This could be |
|---|
| 1621 | 1607 | * dangerous when we use it to iterate array sched_domains_numa_masks[][] |
|---|
| 1622 | 1608 | * in other functions. |
|---|
| 1623 | 1609 | * |
|---|
| 1624 | | - * We reset it to 'level' at the end of this function. |
|---|
| 1610 | + * We reset it to 'nr_levels' at the end of this function. |
|---|
| 1625 | 1611 | */ |
|---|
| 1626 | 1612 | sched_domains_numa_levels = 0; |
|---|
| 1627 | 1613 | |
|---|
| 1628 | | - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); |
|---|
| 1614 | + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); |
|---|
| 1629 | 1615 | if (!sched_domains_numa_masks) |
|---|
| 1630 | 1616 | return; |
|---|
| 1631 | 1617 | |
|---|
| .. | .. |
|---|
| 1633 | 1619 | * Now for each level, construct a mask per node which contains all |
|---|
| 1634 | 1620 | * CPUs of nodes that are that many hops away from us. |
|---|
| 1635 | 1621 | */ |
|---|
| 1636 | | - for (i = 0; i < level; i++) { |
|---|
| 1622 | + for (i = 0; i < nr_levels; i++) { |
|---|
| 1637 | 1623 | sched_domains_numa_masks[i] = |
|---|
| 1638 | 1624 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); |
|---|
| 1639 | 1625 | if (!sched_domains_numa_masks[i]) |
|---|
| .. | .. |
|---|
| 1641 | 1627 | |
|---|
| 1642 | 1628 | for (j = 0; j < nr_node_ids; j++) { |
|---|
| 1643 | 1629 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); |
|---|
| 1630 | + int k; |
|---|
| 1631 | + |
|---|
| 1644 | 1632 | if (!mask) |
|---|
| 1645 | 1633 | return; |
|---|
| 1646 | 1634 | |
|---|
| 1647 | 1635 | sched_domains_numa_masks[i][j] = mask; |
|---|
| 1648 | 1636 | |
|---|
| 1649 | 1637 | for_each_node(k) { |
|---|
| 1638 | + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) |
|---|
| 1639 | + sched_numa_warn("Node-distance not symmetric"); |
|---|
| 1640 | + |
|---|
| 1650 | 1641 | if (node_distance(j, k) > sched_domains_numa_distance[i]) |
|---|
| 1651 | 1642 | continue; |
|---|
| 1652 | 1643 | |
|---|
| .. | .. |
|---|
| 1658 | 1649 | /* Compute default topology size */ |
|---|
| 1659 | 1650 | for (i = 0; sched_domain_topology[i].mask; i++); |
|---|
| 1660 | 1651 | |
|---|
| 1661 | | - tl = kzalloc((i + level + 1) * |
|---|
| 1652 | + tl = kzalloc((i + nr_levels + 1) * |
|---|
| 1662 | 1653 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); |
|---|
| 1663 | 1654 | if (!tl) |
|---|
| 1664 | 1655 | return; |
|---|
| .. | .. |
|---|
| 1681 | 1672 | /* |
|---|
| 1682 | 1673 | * .. and append 'j' levels of NUMA goodness. |
|---|
| 1683 | 1674 | */ |
|---|
| 1684 | | - for (j = 1; j < level; i++, j++) { |
|---|
| 1675 | + for (j = 1; j < nr_levels; i++, j++) { |
|---|
| 1685 | 1676 | tl[i] = (struct sched_domain_topology_level){ |
|---|
| 1686 | 1677 | .mask = sd_numa_mask, |
|---|
| 1687 | 1678 | .sd_flags = cpu_numa_flags, |
|---|
| .. | .. |
|---|
| 1693 | 1684 | |
|---|
| 1694 | 1685 | sched_domain_topology = tl; |
|---|
| 1695 | 1686 | |
|---|
| 1696 | | - sched_domains_numa_levels = level; |
|---|
| 1697 | | - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; |
|---|
| 1687 | + sched_domains_numa_levels = nr_levels; |
|---|
| 1688 | + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; |
|---|
| 1698 | 1689 | |
|---|
| 1699 | 1690 | init_numa_topology_type(); |
|---|
| 1700 | 1691 | } |
|---|
| .. | .. |
|---|
| 1720 | 1711 | for (j = 0; j < nr_node_ids; j++) |
|---|
| 1721 | 1712 | cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); |
|---|
| 1722 | 1713 | } |
|---|
| 1714 | +} |
|---|
| 1715 | + |
|---|
| 1716 | +/* |
|---|
| 1717 | + * sched_numa_find_closest() - given the NUMA topology, find the cpu |
|---|
| 1718 | + * closest to @cpu from @cpus. |
|---|
| 1719 | + * @cpus: cpumask to find a cpu from |
|---|
| 1720 | + * @cpu: cpu to be close to |
|---|
| 1721 | + * |
|---|
| 1722 | + * returns: cpu, or nr_cpu_ids when nothing found. |
|---|
| 1723 | + */ |
|---|
| 1724 | +int sched_numa_find_closest(const struct cpumask *cpus, int cpu) |
|---|
| 1725 | +{ |
|---|
| 1726 | + int i, j = cpu_to_node(cpu); |
|---|
| 1727 | + |
|---|
| 1728 | + for (i = 0; i < sched_domains_numa_levels; i++) { |
|---|
| 1729 | + cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]); |
|---|
| 1730 | + if (cpu < nr_cpu_ids) |
|---|
| 1731 | + return cpu; |
|---|
| 1732 | + } |
|---|
| 1733 | + return nr_cpu_ids; |
|---|
| 1723 | 1734 | } |
|---|
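The helper walks sched_domains_numa_masks[] from the closest distance level outwards and returns the first CPU of @cpus found within that radius. A hypothetical caller sketch (the function and its fallback policy are made up for illustration; only sched_numa_find_closest(), cpumask_any() and nr_cpu_ids are real kernel symbols):

```c
/* Pick a CPU from @allowed that is NUMA-closest to @home_cpu. */
static int pick_nearby_cpu(const struct cpumask *allowed, int home_cpu)
{
	int cpu = sched_numa_find_closest(allowed, home_cpu);

	/* Fall back to any allowed CPU if nothing was found. */
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_any(allowed);

	return cpu;
}
```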
| 1724 | 1735 | |
|---|
| 1725 | 1736 | #endif /* CONFIG_NUMA */ |
|---|
| .. | .. |
|---|
| 1860 | 1871 | } |
|---|
| 1861 | 1872 | |
|---|
| 1862 | 1873 | /* |
|---|
| 1874 | + * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for |
|---|
| 1875 | + * any two given CPUs at this (non-NUMA) topology level. |
|---|
| 1876 | + */ |
|---|
| 1877 | +static bool topology_span_sane(struct sched_domain_topology_level *tl, |
|---|
| 1878 | + const struct cpumask *cpu_map, int cpu) |
|---|
| 1879 | +{ |
|---|
| 1880 | + int i; |
|---|
| 1881 | + |
|---|
| 1882 | + /* NUMA levels are allowed to overlap */ |
|---|
| 1883 | + if (tl->flags & SDTL_OVERLAP) |
|---|
| 1884 | + return true; |
|---|
| 1885 | + |
|---|
| 1886 | + /* |
|---|
| 1887 | + * Non-NUMA levels cannot partially overlap - they must be either |
|---|
| 1888 | + * completely equal or completely disjoint. Otherwise we can end up |
|---|
| 1889 | + * breaking the sched_group lists - i.e. a later get_group() pass |
|---|
| 1890 | + * breaks the linking done for an earlier span. |
|---|
| 1891 | + */ |
|---|
| 1892 | + for_each_cpu(i, cpu_map) { |
|---|
| 1893 | + if (i == cpu) |
|---|
| 1894 | + continue; |
|---|
| 1895 | + /* |
|---|
| 1896 | + * We should 'and' all those masks with 'cpu_map' to exactly |
|---|
| 1897 | + * match the topology we're about to build, but that can only |
|---|
| 1898 | + * remove CPUs, which only lessens our ability to detect |
|---|
| 1899 | + * overlaps |
|---|
| 1900 | + */ |
|---|
| 1901 | + if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) && |
|---|
| 1902 | + cpumask_intersects(tl->mask(cpu), tl->mask(i))) |
|---|
| 1903 | + return false; |
|---|
| 1904 | + } |
|---|
| 1905 | + |
|---|
| 1906 | + return true; |
|---|
| 1907 | +} |
|---|
| 1908 | + |
|---|
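To make the "completely equal or completely disjoint" rule concrete, here is a minimal userspace sketch of the same predicate over plain bitmasks (struct cpumask and the helper names are replaced by assumptions; only the logic mirrors the check above):

```c
#include <stdbool.h>
#include <stdint.h>

/* Two non-NUMA spans are sane iff they are equal or share no CPU. */
static bool spans_sane(uint64_t span_a, uint64_t span_b)
{
	return span_a == span_b || !(span_a & span_b);
}

/*
 * spans_sane(0x3, 0x3) -> true  (equal spans)
 * spans_sane(0x3, 0xc) -> true  (disjoint spans)
 * spans_sane(0x7, 0xe) -> false (partial overlap: the case that would
 *                                corrupt the sched_group lists)
 */
```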
| 1909 | +/* |
|---|
| 1863 | 1910 | * Find the sched_domain_topology_level where all CPU capacities are visible |
|---|
| 1864 | 1911 | * for all CPUs. |
|---|
| 1865 | 1912 | */ |
|---|
| .. | .. |
|---|
| 1872 | 1919 | unsigned long cap; |
|---|
| 1873 | 1920 | |
|---|
| 1874 | 1921 | /* Is there any asymmetry? */ |
|---|
| 1875 | | - cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); |
|---|
| 1922 | + cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); |
|---|
| 1876 | 1923 | |
|---|
| 1877 | 1924 | for_each_cpu(i, cpu_map) { |
|---|
| 1878 | | - if (arch_scale_cpu_capacity(NULL, i) != cap) { |
|---|
| 1925 | + if (arch_scale_cpu_capacity(i) != cap) { |
|---|
| 1879 | 1926 | asym = true; |
|---|
| 1880 | 1927 | break; |
|---|
| 1881 | 1928 | } |
|---|
| .. | .. |
|---|
| 1890 | 1937 | * to everyone. |
|---|
| 1891 | 1938 | */ |
|---|
| 1892 | 1939 | for_each_cpu(i, cpu_map) { |
|---|
| 1893 | | - unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); |
|---|
| 1940 | + unsigned long max_capacity = arch_scale_cpu_capacity(i); |
|---|
| 1894 | 1941 | int tl_id = 0; |
|---|
| 1895 | 1942 | |
|---|
| 1896 | 1943 | for_each_sd_topology(tl) { |
|---|
| .. | .. |
|---|
| 1900 | 1947 | for_each_cpu_and(j, tl->mask(i), cpu_map) { |
|---|
| 1901 | 1948 | unsigned long capacity; |
|---|
| 1902 | 1949 | |
|---|
| 1903 | | - capacity = arch_scale_cpu_capacity(NULL, j); |
|---|
| 1950 | + capacity = arch_scale_cpu_capacity(j); |
|---|
| 1904 | 1951 | |
|---|
| 1905 | 1952 | if (capacity <= max_capacity) |
|---|
| 1906 | 1953 | continue; |
|---|
| .. | .. |
|---|
| 1925 | 1972 | static int |
|---|
| 1926 | 1973 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) |
|---|
| 1927 | 1974 | { |
|---|
| 1928 | | - enum s_alloc alloc_state; |
|---|
| 1975 | + enum s_alloc alloc_state = sa_none; |
|---|
| 1929 | 1976 | struct sched_domain *sd; |
|---|
| 1930 | 1977 | struct s_data d; |
|---|
| 1978 | + struct rq *rq = NULL; |
|---|
| 1931 | 1979 | int i, ret = -ENOMEM; |
|---|
| 1932 | 1980 | struct sched_domain_topology_level *tl_asym; |
|---|
| 1933 | 1981 | bool has_asym = false; |
|---|
| 1982 | + |
|---|
| 1983 | + if (WARN_ON(cpumask_empty(cpu_map))) |
|---|
| 1984 | + goto error; |
|---|
| 1934 | 1985 | |
|---|
| 1935 | 1986 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
|---|
| 1936 | 1987 | if (alloc_state != sa_rootdomain) |
|---|
| .. | .. |
|---|
| 1941 | 1992 | /* Set up domains for CPUs specified by the cpu_map: */ |
|---|
| 1942 | 1993 | for_each_cpu(i, cpu_map) { |
|---|
| 1943 | 1994 | struct sched_domain_topology_level *tl; |
|---|
| 1995 | + int dflags = 0; |
|---|
| 1944 | 1996 | |
|---|
| 1945 | 1997 | sd = NULL; |
|---|
| 1946 | 1998 | for_each_sd_topology(tl) { |
|---|
| 1947 | | - int dflags = 0; |
|---|
| 1948 | | - |
|---|
| 1949 | 1999 | if (tl == tl_asym) { |
|---|
| 1950 | 2000 | dflags |= SD_ASYM_CPUCAPACITY; |
|---|
| 1951 | 2001 | has_asym = true; |
|---|
| 1952 | 2002 | } |
|---|
| 2003 | + |
|---|
| 2004 | + if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) |
|---|
| 2005 | + goto error; |
|---|
| 1953 | 2006 | |
|---|
| 1954 | 2007 | sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i); |
|---|
| 1955 | 2008 | |
|---|
| .. | .. |
|---|
| 1990 | 2043 | /* Attach the domains */ |
|---|
| 1991 | 2044 | rcu_read_lock(); |
|---|
| 1992 | 2045 | for_each_cpu(i, cpu_map) { |
|---|
| 2046 | + rq = cpu_rq(i); |
|---|
| 1993 | 2047 | sd = *per_cpu_ptr(d.sd, i); |
|---|
| 2048 | + |
|---|
| 2049 | + /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ |
|---|
| 2050 | + if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) |
|---|
| 2051 | + WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); |
|---|
| 2052 | + |
|---|
| 1994 | 2053 | cpu_attach_domain(sd, d.rd, i); |
|---|
| 1995 | 2054 | } |
|---|
| 1996 | 2055 | rcu_read_unlock(); |
|---|
| 1997 | 2056 | |
|---|
| 1998 | 2057 | if (has_asym) |
|---|
| 1999 | 2058 | static_branch_inc_cpuslocked(&sched_asym_cpucapacity); |
|---|
| 2059 | + |
|---|
| 2060 | + if (rq && sched_debug_enabled) { |
|---|
| 2061 | + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", |
|---|
| 2062 | + cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); |
|---|
| 2063 | + } |
|---|
| 2064 | + trace_android_vh_build_sched_domains(has_asym); |
|---|
| 2000 | 2065 | |
|---|
| 2001 | 2066 | ret = 0; |
|---|
| 2002 | 2067 | error: |
|---|
| .. | .. |
|---|
| 2057 | 2122 | } |
|---|
| 2058 | 2123 | |
|---|
| 2059 | 2124 | /* |
|---|
| 2060 | | - * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
|---|
| 2061 | | - * For now this just excludes isolated CPUs, but could be used to |
|---|
| 2062 | | - * exclude other special cases in the future. |
|---|
| 2125 | + * Set up scheduler domains and groups. For now this just excludes isolated |
|---|
| 2126 | + * CPUs, but could be used to exclude other special cases in the future. |
|---|
| 2063 | 2127 | */ |
|---|
| 2064 | 2128 | int sched_init_domains(const struct cpumask *cpu_map) |
|---|
| 2065 | 2129 | { |
|---|
| .. | .. |
|---|
| 2140 | 2204 | * ndoms_new == 0 is a special case for destroying existing domains, |
|---|
| 2141 | 2205 | * and it will not create the default domain. |
|---|
| 2142 | 2206 | * |
|---|
| 2143 | | - * Call with hotplug lock held |
|---|
| 2207 | + * Call with hotplug lock and sched_domains_mutex held |
|---|
| 2144 | 2208 | */ |
|---|
| 2145 | | -void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
|---|
| 2146 | | - struct sched_domain_attr *dattr_new) |
|---|
| 2209 | +void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], |
|---|
| 2210 | + struct sched_domain_attr *dattr_new) |
|---|
| 2147 | 2211 | { |
|---|
| 2148 | 2212 | bool __maybe_unused has_eas = false; |
|---|
| 2149 | 2213 | int i, j, n; |
|---|
| 2150 | 2214 | int new_topology; |
|---|
| 2151 | 2215 | |
|---|
| 2152 | | - mutex_lock(&sched_domains_mutex); |
|---|
| 2216 | + lockdep_assert_held(&sched_domains_mutex); |
|---|
| 2153 | 2217 | |
|---|
| 2154 | 2218 | /* Always unregister in case we don't destroy any domains: */ |
|---|
| 2155 | 2219 | unregister_sched_domain_sysctl(); |
|---|
| .. | .. |
|---|
| 2174 | 2238 | for (i = 0; i < ndoms_cur; i++) { |
|---|
| 2175 | 2239 | for (j = 0; j < n && !new_topology; j++) { |
|---|
| 2176 | 2240 | if (cpumask_equal(doms_cur[i], doms_new[j]) && |
|---|
| 2177 | | - dattrs_equal(dattr_cur, i, dattr_new, j)) |
|---|
| 2241 | + dattrs_equal(dattr_cur, i, dattr_new, j)) { |
|---|
| 2242 | + struct root_domain *rd; |
|---|
| 2243 | + |
|---|
| 2244 | + /* |
|---|
| 2245 | + * This domain won't be destroyed and as such |
|---|
| 2246 | + * its dl_bw->total_bw needs to be cleared. It |
|---|
| 2247 | + * will be recomputed in function |
|---|
| 2248 | + * update_tasks_root_domain(). |
|---|
| 2249 | + */ |
|---|
| 2250 | + rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; |
|---|
| 2251 | + dl_clear_root_domain(rd); |
|---|
| 2178 | 2252 | goto match1; |
|---|
| 2253 | + } |
|---|
| 2179 | 2254 | } |
|---|
| 2180 | 2255 | /* No match - a current sched domain not in new doms_new[] */ |
|---|
| 2181 | 2256 | detach_destroy_domains(doms_cur[i]); |
|---|
| .. | .. |
|---|
| 2204 | 2279 | ; |
|---|
| 2205 | 2280 | } |
|---|
| 2206 | 2281 | |
|---|
| 2207 | | -#ifdef CONFIG_ENERGY_MODEL |
|---|
| 2282 | +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
|---|
| 2208 | 2283 | /* Build perf. domains: */ |
|---|
| 2209 | 2284 | for (i = 0; i < ndoms_new; i++) { |
|---|
| 2210 | | - for (j = 0; j < n; j++) { |
|---|
| 2285 | + for (j = 0; j < n && !sched_energy_update; j++) { |
|---|
| 2211 | 2286 | if (cpumask_equal(doms_new[i], doms_cur[j]) && |
|---|
| 2212 | 2287 | cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { |
|---|
| 2213 | 2288 | has_eas = true; |
|---|
| .. | .. |
|---|
| 2232 | 2307 | ndoms_cur = ndoms_new; |
|---|
| 2233 | 2308 | |
|---|
| 2234 | 2309 | register_sched_domain_sysctl(); |
|---|
| 2310 | +} |
|---|
| 2235 | 2311 | |
|---|
| 2312 | +/* |
|---|
| 2313 | + * Call with hotplug lock held |
|---|
| 2314 | + */ |
|---|
| 2315 | +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
|---|
| 2316 | + struct sched_domain_attr *dattr_new) |
|---|
| 2317 | +{ |
|---|
| 2318 | + mutex_lock(&sched_domains_mutex); |
|---|
| 2319 | + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); |
|---|
| 2236 | 2320 | mutex_unlock(&sched_domains_mutex); |
|---|
| 2237 | 2321 | } |
|---|
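With the split above, a path that already needs sched_domains_mutex for other work can take the lock once and call the _locked variant directly, while everyone else keeps using partition_sched_domains(). A hypothetical caller sketch (the wrapper function below is illustrative only and not part of this patch):

```c
/*
 * Illustrative only: rebuild the domains while other state is updated
 * under the same critical section. The hotplug lock is assumed to be
 * held by our caller, as the comment above partition_sched_domains_locked()
 * requires.
 */
static void rebuild_domains_example(int ndoms, cpumask_var_t doms[],
				    struct sched_domain_attr *attr)
{
	mutex_lock(&sched_domains_mutex);
	/* ... update state that must stay consistent with the domains ... */
	partition_sched_domains_locked(ndoms, doms, attr);
	mutex_unlock(&sched_domains_mutex);
}
```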