.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * x86 SMP booting functions |
---|
3 | 4 | * |
---|
.. | .. |
---|
11 | 12 | * Thanks to Intel for making available several different Pentium, |
---|
12 | 13 | * Pentium Pro and Pentium-II/Xeon MP machines. |
---|
13 | 14 | * Original development of Linux SMP code supported by Caldera. |
---|
14 | | - * |
---|
15 | | - * This code is released under the GNU General Public License version 2 or |
---|
16 | | - * later. |
---|
17 | 15 | * |
---|
18 | 16 | * Fixes |
---|
19 | 17 | * Felix Koop : NR_CPUS used properly |
---|
.. | .. |
---|
49 | 47 | #include <linux/sched/hotplug.h> |
---|
50 | 48 | #include <linux/sched/task_stack.h> |
---|
51 | 49 | #include <linux/percpu.h> |
---|
52 | | -#include <linux/bootmem.h> |
---|
| 50 | +#include <linux/memblock.h> |
---|
53 | 51 | #include <linux/err.h> |
---|
54 | 52 | #include <linux/nmi.h> |
---|
55 | 53 | #include <linux/tboot.h> |
---|
56 | | -#include <linux/stackprotector.h> |
---|
57 | 54 | #include <linux/gfp.h> |
---|
58 | 55 | #include <linux/cpuidle.h> |
---|
| 56 | +#include <linux/numa.h> |
---|
| 57 | +#include <linux/pgtable.h> |
---|
| 58 | +#include <linux/overflow.h> |
---|
59 | 59 | |
---|
60 | 60 | #include <asm/acpi.h> |
---|
61 | 61 | #include <asm/desc.h> |
---|
.. | .. |
---|
64 | 64 | #include <asm/realmode.h> |
---|
65 | 65 | #include <asm/cpu.h> |
---|
66 | 66 | #include <asm/numa.h> |
---|
67 | | -#include <asm/pgtable.h> |
---|
68 | 67 | #include <asm/tlbflush.h> |
---|
69 | 68 | #include <asm/mtrr.h> |
---|
70 | 69 | #include <asm/mwait.h> |
---|
.. | .. |
---|
81 | 80 | #include <asm/cpu_device_id.h> |
---|
82 | 81 | #include <asm/spec-ctrl.h> |
---|
83 | 82 | #include <asm/hw_irq.h> |
---|
| 83 | +#include <asm/stackprotector.h> |
---|
84 | 84 | |
---|
85 | 85 | /* representing HT siblings of each logical CPU */ |
---|
86 | 86 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); |
---|
.. | .. |
---|
90 | 90 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); |
---|
91 | 91 | EXPORT_PER_CPU_SYMBOL(cpu_core_map); |
---|
92 | 92 | |
---|
| 93 | +/* representing HT, core, and die siblings of each logical CPU */ |
---|
| 94 | +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); |
---|
| 95 | +EXPORT_PER_CPU_SYMBOL(cpu_die_map); |
---|
| 96 | + |
---|
93 | 97 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); |
---|
94 | 98 | |
---|
95 | 99 | /* Per CPU bogomips and other parameters */ |
---|
96 | 100 | DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); |
---|
97 | 101 | EXPORT_PER_CPU_SYMBOL(cpu_info); |
---|
98 | 102 | |
---|
| 103 | +struct mwait_cpu_dead { |
---|
| 104 | + unsigned int control; |
---|
| 105 | + unsigned int status; |
---|
| 106 | +}; |
---|
| 107 | + |
---|
| 108 | +/* |
---|
| 109 | + * Cache line aligned data for mwait_play_dead(). Separate on purpose so |
---|
| 110 | + * that it's unlikely to be touched by other CPUs. |
---|
| 111 | + */ |
---|
| 112 | +static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead); |
---|
| 113 | + |
---|
99 | 114 | /* Logical package management. We might want to allocate that dynamically */ |
---|
100 | 115 | unsigned int __max_logical_packages __read_mostly; |
---|
101 | 116 | EXPORT_SYMBOL(__max_logical_packages); |
---|
102 | 117 | static unsigned int logical_packages __read_mostly; |
---|
| 118 | +static unsigned int logical_die __read_mostly; |
---|
103 | 119 | |
---|
104 | 120 | /* Maximum number of SMT threads on any online core */ |
---|
105 | 121 | int __read_mostly __max_smt_threads = 1; |
---|
.. | .. |
---|
143 | 159 | *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; |
---|
144 | 160 | } |
---|
145 | 161 | |
---|
| 162 | +static void init_freq_invariance(bool secondary); |
---|
| 163 | + |
---|
146 | 164 | /* |
---|
147 | 165 | * Report back to the Boot Processor during boot time or to the caller processor |
---|
148 | 166 | * during CPU online. |
---|
149 | 167 | */ |
---|
150 | 168 | static void smp_callin(void) |
---|
151 | 169 | { |
---|
152 | | - int cpuid, phys_id; |
---|
| 170 | + int cpuid; |
---|
153 | 171 | |
---|
154 | 172 | /* |
---|
155 | 173 | * If waken up by an INIT in an 82489DX configuration |
---|
.. | .. |
---|
158 | 176 | * now safe to touch our local APIC. |
---|
159 | 177 | */ |
---|
160 | 178 | cpuid = smp_processor_id(); |
---|
161 | | - |
---|
162 | | - /* |
---|
163 | | - * (This works even if the APIC is not enabled.) |
---|
164 | | - */ |
---|
165 | | - phys_id = read_apic_id(); |
---|
166 | 179 | |
---|
167 | 180 | /* |
---|
168 | 181 | * the boot CPU has finished the init stage and is spinning |
---|
.. | .. |
---|
183 | 196 | * calibrate_delay() and notify_cpu_starting(). |
---|
184 | 197 | */ |
---|
185 | 198 | set_cpu_sibling_map(raw_smp_processor_id()); |
---|
| 199 | + |
---|
| 200 | + init_freq_invariance(true); |
---|
186 | 201 | |
---|
187 | 202 | /* |
---|
188 | 203 | * Get our bogomips. |
---|
.. | .. |
---|
216 | 231 | * before cpu_init(), SMP booting is too fragile that we want to |
---|
217 | 232 | * limit the things done here to the most necessary things. |
---|
218 | 233 | */ |
---|
219 | | - if (boot_cpu_has(X86_FEATURE_PCID)) |
---|
220 | | - __write_cr4(__read_cr4() | X86_CR4_PCIDE); |
---|
| 234 | + cr4_init(); |
---|
221 | 235 | |
---|
222 | 236 | #ifdef CONFIG_X86_32 |
---|
223 | 237 | /* switch away from the initial page table */ |
---|
224 | 238 | load_cr3(swapper_pg_dir); |
---|
225 | | - /* |
---|
226 | | - * Initialize the CR4 shadow before doing anything that could |
---|
227 | | - * try to read it. |
---|
228 | | - */ |
---|
229 | | - cr4_init_shadow(); |
---|
230 | 239 | __flush_tlb_all(); |
---|
231 | 240 | #endif |
---|
232 | | - load_current_idt(); |
---|
233 | | - cpu_init(); |
---|
| 241 | + cpu_init_secondary(); |
---|
| 242 | + rcu_cpu_starting(raw_smp_processor_id()); |
---|
234 | 243 | x86_cpuinit.early_percpu_clock_init(); |
---|
235 | | - preempt_disable(); |
---|
236 | 244 | smp_callin(); |
---|
237 | 245 | |
---|
238 | 246 | enable_start_cpu0 = 0; |
---|
.. | .. |
---|
262 | 270 | /* enable local interrupts */ |
---|
263 | 271 | local_irq_enable(); |
---|
264 | 272 | |
---|
265 | | - /* to prevent fake stack check failure in clock setup */ |
---|
266 | | - boot_init_stack_canary(); |
---|
267 | | - |
---|
268 | 273 | x86_cpuinit.setup_percpu_clockev(); |
---|
269 | 274 | |
---|
270 | 275 | wmb(); |
---|
271 | 276 | cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); |
---|
272 | | - |
---|
273 | | - /* |
---|
274 | | - * Prevent tail call to cpu_startup_entry() because the stack protector |
---|
275 | | - * guard has been changed a couple of function calls up, in |
---|
276 | | - * boot_init_stack_canary() and must not be checked before tail calling |
---|
277 | | - * another function. |
---|
278 | | - */ |
---|
279 | | - prevent_tail_call_optimization(); |
---|
280 | 277 | } |
---|
281 | 278 | |
---|
282 | 279 | /** |
---|
.. | .. |
---|
314 | 311 | return -1; |
---|
315 | 312 | } |
---|
316 | 313 | EXPORT_SYMBOL(topology_phys_to_logical_pkg); |
---|
| 314 | +/** |
---|
| 315 | + * topology_phys_to_logical_die - Map a physical die id to logical |
---|
| 316 | + * |
---|
| 317 | + * Returns logical die id or -1 if not found |
---|
| 318 | + */ |
---|
| 319 | +int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) |
---|
| 320 | +{ |
---|
| 321 | + int cpu; |
---|
| 322 | + int proc_id = cpu_data(cur_cpu).phys_proc_id; |
---|
| 323 | + |
---|
| 324 | + for_each_possible_cpu(cpu) { |
---|
| 325 | + struct cpuinfo_x86 *c = &cpu_data(cpu); |
---|
| 326 | + |
---|
| 327 | + if (c->initialized && c->cpu_die_id == die_id && |
---|
| 328 | + c->phys_proc_id == proc_id) |
---|
| 329 | + return c->logical_die_id; |
---|
| 330 | + } |
---|
| 331 | + return -1; |
---|
| 332 | +} |
---|
| 333 | +EXPORT_SYMBOL(topology_phys_to_logical_die); |
---|
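
The die level introduced here is also exported to user space next to the existing package id: on kernels that carry this series, each CPU's sysfs topology directory gains die_id and die_cpus[_list] entries. A minimal user-space sketch (the sysfs file names are assumed from that interface, they are not part of this hunk) that prints the physical package/die of every CPU:

```c
/*
 * Illustration only: read the physical package and die id of each CPU
 * from sysfs. Assumes a kernel that includes the multi-die topology
 * work, i.e. that .../topology/die_id exists.
 */
#include <stdio.h>
#include <unistd.h>

static int read_topo_val(int cpu, const char *name)
{
	char path[128];
	FILE *f;
	int val = -1;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/%s", cpu, name);
	f = fopen(path, "r");
	if (!f)
		return -1;	/* offline CPU or pre-die kernel */
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	int ncpus = (int)sysconf(_SC_NPROCESSORS_CONF);

	for (int cpu = 0; cpu < ncpus; cpu++) {
		int pkg = read_topo_val(cpu, "physical_package_id");
		int die = read_topo_val(cpu, "die_id");

		if (pkg < 0)
			continue;
		printf("cpu%d: package %d, die %d\n", cpu, pkg, die);
	}
	return 0;
}
```

Note that die_id above is the physical id from CPUID; the logical id assigned by topology_update_die_map() below is what the kernel uses internally, e.g. to index per-die bookkeeping.
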
317 | 334 | |
---|
318 | 335 | /** |
---|
319 | 336 | * topology_update_package_map - Update the physical to logical package map |
---|
.. | .. |
---|
338 | 355 | cpu_data(cpu).logical_proc_id = new; |
---|
339 | 356 | return 0; |
---|
340 | 357 | } |
---|
| 358 | +/** |
---|
| 359 | + * topology_update_die_map - Update the physical to logical die map |
---|
| 360 | + * @die: The die id as retrieved via CPUID |
---|
| 361 | + * @cpu: The cpu for which this is updated |
---|
| 362 | + */ |
---|
| 363 | +int topology_update_die_map(unsigned int die, unsigned int cpu) |
---|
| 364 | +{ |
---|
| 365 | + int new; |
---|
| 366 | + |
---|
| 367 | + /* Already available somewhere? */ |
---|
| 368 | + new = topology_phys_to_logical_die(die, cpu); |
---|
| 369 | + if (new >= 0) |
---|
| 370 | + goto found; |
---|
| 371 | + |
---|
| 372 | + new = logical_die++; |
---|
| 373 | + if (new != die) { |
---|
| 374 | + pr_info("CPU %u Converting physical %u to logical die %u\n", |
---|
| 375 | + cpu, die, new); |
---|
| 376 | + } |
---|
| 377 | +found: |
---|
| 378 | + cpu_data(cpu).logical_die_id = new; |
---|
| 379 | + return 0; |
---|
| 380 | +} |
---|
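
One subtlety worth spelling out: logical_die is a single global counter, while the CPUID die id restarts at 0 in every package, so logical die ids are unique system-wide and quickly diverge from the physical ones (which is when the pr_info() above fires). A toy user-space model of that numbering, using made-up topology data rather than anything from the patch:

```c
/*
 * Toy model (illustration only, not kernel code): logical die ids come
 * from one global counter, so dies in different packages never share
 * an id even though their CPUID die ids both start at 0.
 */
#include <stdio.h>

struct die { int pkg, phys_die, logical; };

static struct die seen[64];
static int nr_seen, logical_die;

static int phys_to_logical_die(int pkg, int phys_die)
{
	for (int i = 0; i < nr_seen; i++)
		if (seen[i].pkg == pkg && seen[i].phys_die == phys_die)
			return seen[i].logical;		/* already mapped */
	return -1;
}

static int update_die_map(int pkg, int phys_die)
{
	int new = phys_to_logical_die(pkg, phys_die);

	if (new < 0) {
		new = logical_die++;
		seen[nr_seen++] = (struct die){ pkg, phys_die, new };
	}
	return new;
}

int main(void)
{
	/* Two packages, two dies each: physical die ids repeat per package. */
	int cpus[][2] = { {0, 0}, {0, 1}, {1, 0}, {1, 1} };

	for (int i = 0; i < 4; i++)
		printf("pkg %d die %d -> logical die %d\n",
		       cpus[i][0], cpus[i][1],
		       update_die_map(cpus[i][0], cpus[i][1]));
	return 0;
}
```

On this sample two-package, two-die-per-package input the logical dies come out as 0..3, even though the physical die ids are only ever 0 and 1.
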
341 | 381 | |
---|
342 | 382 | void __init smp_store_boot_cpu_info(void) |
---|
343 | 383 | { |
---|
.. | .. |
---|
347 | 387 | *c = boot_cpu_data; |
---|
348 | 388 | c->cpu_index = id; |
---|
349 | 389 | topology_update_package_map(c->phys_proc_id, id); |
---|
| 390 | + topology_update_die_map(c->cpu_die_id, id); |
---|
350 | 391 | c->initialized = true; |
---|
351 | 392 | } |
---|
352 | 393 | |
---|
.. | .. |
---|
401 | 442 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
---|
402 | 443 | |
---|
403 | 444 | if (c->phys_proc_id == o->phys_proc_id && |
---|
| 445 | + c->cpu_die_id == o->cpu_die_id && |
---|
404 | 446 | per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { |
---|
405 | 447 | if (c->cpu_core_id == o->cpu_core_id) |
---|
406 | 448 | return topology_sane(c, o, "smt"); |
---|
.. | .. |
---|
412 | 454 | } |
---|
413 | 455 | |
---|
414 | 456 | } else if (c->phys_proc_id == o->phys_proc_id && |
---|
| 457 | + c->cpu_die_id == o->cpu_die_id && |
---|
415 | 458 | c->cpu_core_id == o->cpu_core_id) { |
---|
416 | 459 | return topology_sane(c, o, "smt"); |
---|
417 | 460 | } |
---|
.. | .. |
---|
419 | 462 | return false; |
---|
420 | 463 | } |
---|
421 | 464 | |
---|
| 465 | +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
| 466 | +{ |
---|
| 467 | + if (c->phys_proc_id == o->phys_proc_id && |
---|
| 468 | + c->cpu_die_id == o->cpu_die_id) |
---|
| 469 | + return true; |
---|
| 470 | + return false; |
---|
| 471 | +} |
---|
| 472 | + |
---|
422 | 473 | /* |
---|
423 | | - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. |
---|
| 474 | + * Unlike the other levels, we do not enforce keeping a |
---|
| 475 | + * multicore group inside a NUMA node. If this happens, we will |
---|
| 476 | + * discard the MC level of the topology later. |
---|
| 477 | + */ |
---|
| 478 | +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
| 479 | +{ |
---|
| 480 | + if (c->phys_proc_id == o->phys_proc_id) |
---|
| 481 | + return true; |
---|
| 482 | + return false; |
---|
| 483 | +} |
---|
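
For orientation: the three predicates nest by construction. Both branches of match_smt() require matching package and die ids on top of their core/LLC checks, so any pair it accepts is also accepted by match_die() (package + die) and by match_pkg() (package only), and the masks built from them form a hierarchy from SMT siblings up to the whole package, with the LLC mask normally sitting between the SMT and die levels. A compile-ready illustration in plain user-space C (not kernel code):

```c
/* Simplified predicates mirroring the nesting described above. */
#include <assert.h>
#include <stdbool.h>

struct topo { int pkg, die, llc, core; };

static bool match_smt(const struct topo *a, const struct topo *b)
{
	return a->pkg == b->pkg && a->die == b->die &&
	       a->llc == b->llc && a->core == b->core;
}

static bool match_die(const struct topo *a, const struct topo *b)
{
	return a->pkg == b->pkg && a->die == b->die;
}

static bool match_pkg(const struct topo *a, const struct topo *b)
{
	return a->pkg == b->pkg;
}

int main(void)
{
	struct topo t0 = { 0, 0, 0, 0 }, t1 = { 0, 0, 0, 1 };

	/* SMT match implies die match, which implies package match. */
	assert(!match_smt(&t0, &t1) || match_die(&t0, &t1));
	assert(!match_die(&t0, &t1) || match_pkg(&t0, &t1));
	return 0;
}
```
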
| 484 | + |
---|
| 485 | +/* |
---|
| 486 | + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. |
---|
424 | 487 | * |
---|
425 | | - * These are Intel CPUs that enumerate an LLC that is shared by |
---|
426 | | - * multiple NUMA nodes. The LLC on these systems is shared for |
---|
427 | | - * off-package data access but private to the NUMA node (half |
---|
428 | | - * of the package) for on-package access. |
---|
| 488 | + * Any Intel CPU that has multiple nodes per package and does not |
---|
| 489 | + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. |
---|
429 | 490 | * |
---|
430 | | - * CPUID (the source of the information about the LLC) can only |
---|
431 | | - * enumerate the cache as being shared *or* unshared, but not |
---|
432 | | - * this particular configuration. The CPU in this case enumerates |
---|
433 | | - * the cache to be shared across the entire package (spanning both |
---|
434 | | - * NUMA nodes). |
---|
| 491 | + * When in SNC mode, these CPUs enumerate an LLC that is shared |
---|
| 492 | + * by multiple NUMA nodes. The LLC is shared for off-package data |
---|
| 493 | + * access but private to the NUMA node (half of the package) for |
---|
| 494 | + * on-package access. CPUID (the source of the information about |
---|
| 495 | + * the LLC) can only enumerate the cache as shared or unshared, |
---|
| 496 | + * but not this particular configuration. |
---|
435 | 497 | */ |
---|
436 | 498 | |
---|
437 | | -static const struct x86_cpu_id snc_cpu[] = { |
---|
438 | | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, |
---|
| 499 | +static const struct x86_cpu_id intel_cod_cpu[] = { |
---|
| 500 | + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ |
---|
| 501 | + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ |
---|
| 502 | + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ |
---|
439 | 503 | {} |
---|
440 | 504 | }; |
---|
441 | 505 | |
---|
442 | 506 | static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
443 | 507 | { |
---|
| 508 | + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); |
---|
444 | 509 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
---|
| 510 | + bool intel_snc = id && id->driver_data; |
---|
445 | 511 | |
---|
446 | 512 | /* Do not match if we do not have a valid APICID for cpu: */ |
---|
447 | 513 | if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) |
---|
.. | .. |
---|
456 | 522 | * means 'c' does not share the LLC of 'o'. This will be |
---|
457 | 523 | * reflected to userspace. |
---|
458 | 524 | */ |
---|
459 | | - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) |
---|
| 525 | + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) |
---|
460 | 526 | return false; |
---|
461 | 527 | |
---|
462 | 528 | return topology_sane(c, o, "llc"); |
---|
463 | 529 | } |
---|
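
The intel_cod_cpu[] table folds two cases into one first-match lookup: the explicit Haswell-X and Broadwell-X entries carry driver_data 0 (Cluster-on-Die, where the enumerated package-wide LLC is kept as-is), while the trailing ANY entry carries driver_data 1 and tags every other multi-node-per-package Intel CPU as Sub-NUMA Cluster, which is what makes match_llc() refuse to merge LLC masks across nodes. A small mock of that first-match-wins behaviour (the model numbers are shown for illustration; this is not the x86_match_cpu() implementation):

```c
/*
 * Sketch: a first-match table lets the explicit Cluster-on-Die entries
 * (driver_data == 0) shadow the catch-all "any model" entry
 * (driver_data == 1), which marks everything else as SNC.
 */
#include <stdio.h>

#define ANY	-1

struct cpu_id { int model; long driver_data; };

static const struct cpu_id intel_cod_cpu[] = {
	{ 0x3F, 0 },	/* HASWELL_X:   Cluster-on-Die */
	{ 0x4F, 0 },	/* BROADWELL_X: Cluster-on-Die */
	{ ANY,  1 },	/* anything else with >1 node/pkg: SNC */
	{ 0,    0 }	/* terminator */
};

static const struct cpu_id *match_cpu(int model)
{
	for (const struct cpu_id *id = intel_cod_cpu; id->model; id++)
		if (id->model == ANY || id->model == model)
			return id;	/* first match wins */
	return NULL;
}

int main(void)
{
	int models[] = { 0x3F /* HASWELL_X */, 0x55 /* SKYLAKE_X */ };

	for (int i = 0; i < 2; i++) {
		const struct cpu_id *id = match_cpu(models[i]);

		printf("model 0x%x -> %s\n", models[i],
		       id && id->driver_data ? "SNC" : "COD");
	}
	return 0;
}
```
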
464 | 530 | |
---|
465 | | -/* |
---|
466 | | - * Unlike the other levels, we do not enforce keeping a |
---|
467 | | - * multicore group inside a NUMA node. If this happens, we will |
---|
468 | | - * discard the MC level of the topology later. |
---|
469 | | - */ |
---|
470 | | -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
471 | | -{ |
---|
472 | | - if (c->phys_proc_id == o->phys_proc_id) |
---|
473 | | - return true; |
---|
474 | | - return false; |
---|
475 | | -} |
---|
476 | 531 | |
---|
477 | 532 | #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) |
---|
478 | 533 | static inline int x86_sched_itmt_flags(void) |
---|
.. | .. |
---|
536 | 591 | cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); |
---|
537 | 592 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); |
---|
538 | 593 | cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); |
---|
| 594 | + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); |
---|
539 | 595 | c->booted_cores = 1; |
---|
540 | 596 | return; |
---|
541 | 597 | } |
---|
.. | .. |
---|
543 | 599 | for_each_cpu(i, cpu_sibling_setup_mask) { |
---|
544 | 600 | o = &cpu_data(i); |
---|
545 | 601 | |
---|
| 602 | + if (match_pkg(c, o) && !topology_same_node(c, o)) |
---|
| 603 | + x86_has_numa_in_package = true; |
---|
| 604 | + |
---|
546 | 605 | if ((i == cpu) || (has_smt && match_smt(c, o))) |
---|
547 | 606 | link_mask(topology_sibling_cpumask, cpu, i); |
---|
548 | 607 | |
---|
549 | 608 | if ((i == cpu) || (has_mp && match_llc(c, o))) |
---|
550 | 609 | link_mask(cpu_llc_shared_mask, cpu, i); |
---|
551 | 610 | |
---|
| 611 | + if ((i == cpu) || (has_mp && match_die(c, o))) |
---|
| 612 | + link_mask(topology_die_cpumask, cpu, i); |
---|
552 | 613 | } |
---|
| 614 | + |
---|
| 615 | + threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
---|
| 616 | + if (threads > __max_smt_threads) |
---|
| 617 | + __max_smt_threads = threads; |
---|
553 | 618 | |
---|
554 | 619 | /* |
---|
555 | 620 | * This needs a separate iteration over the cpus because we rely on all |
---|
.. | .. |
---|
558 | 623 | for_each_cpu(i, cpu_sibling_setup_mask) { |
---|
559 | 624 | o = &cpu_data(i); |
---|
560 | 625 | |
---|
561 | | - if ((i == cpu) || (has_mp && match_die(c, o))) { |
---|
| 626 | + if ((i == cpu) || (has_mp && match_pkg(c, o))) { |
---|
562 | 627 | link_mask(topology_core_cpumask, cpu, i); |
---|
563 | 628 | |
---|
564 | 629 | /* |
---|
565 | 630 | * Does this new cpu bringup a new core? |
---|
566 | 631 | */ |
---|
567 | | - if (cpumask_weight( |
---|
568 | | - topology_sibling_cpumask(cpu)) == 1) { |
---|
| 632 | + if (threads == 1) { |
---|
569 | 633 | /* |
---|
570 | 634 | * for each core in package, increment |
---|
571 | 635 | * the booted_cores for this new cpu |
---|
.. | .. |
---|
582 | 646 | } else if (i != cpu && !c->booted_cores) |
---|
583 | 647 | c->booted_cores = cpu_data(i).booted_cores; |
---|
584 | 648 | } |
---|
585 | | - if (match_die(c, o) && !topology_same_node(c, o)) |
---|
586 | | - x86_has_numa_in_package = true; |
---|
587 | 649 | } |
---|
588 | | - |
---|
589 | | - threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
---|
590 | | - if (threads > __max_smt_threads) |
---|
591 | | - __max_smt_threads = threads; |
---|
592 | 650 | } |
---|
593 | 651 | |
---|
594 | 652 | /* maps the cpu to the sched domain representing multi-core */ |
---|
.. | .. |
---|
684 | 742 | |
---|
685 | 743 | /* if modern processor, use no delay */ |
---|
686 | 744 | if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || |
---|
| 745 | + ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || |
---|
687 | 746 | ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { |
---|
688 | 747 | init_udelay = 0; |
---|
689 | 748 | return; |
---|
.. | .. |
---|
848 | 907 | /* reduce the number of lines printed when booting a large cpu count system */ |
---|
849 | 908 | static void announce_cpu(int cpu, int apicid) |
---|
850 | 909 | { |
---|
851 | | - static int current_node = -1; |
---|
| 910 | + static int current_node = NUMA_NO_NODE; |
---|
852 | 911 | int node = early_cpu_to_node(cpu); |
---|
853 | 912 | static int width, node_width; |
---|
854 | 913 | |
---|
.. | .. |
---|
946 | 1005 | return boot_error; |
---|
947 | 1006 | } |
---|
948 | 1007 | |
---|
949 | | -void common_cpu_up(unsigned int cpu, struct task_struct *idle) |
---|
| 1008 | +int common_cpu_up(unsigned int cpu, struct task_struct *idle) |
---|
950 | 1009 | { |
---|
| 1010 | + int ret; |
---|
| 1011 | + |
---|
951 | 1012 | /* Just in case we booted with a single CPU. */ |
---|
952 | 1013 | alternatives_enable_smp(); |
---|
953 | 1014 | |
---|
954 | 1015 | per_cpu(current_task, cpu) = idle; |
---|
| 1016 | + cpu_init_stack_canary(cpu, idle); |
---|
| 1017 | + |
---|
| 1018 | + /* Initialize the interrupt stack(s) */ |
---|
| 1019 | + ret = irq_init_percpu_irqstack(cpu); |
---|
| 1020 | + if (ret) |
---|
| 1021 | + return ret; |
---|
955 | 1022 | |
---|
956 | 1023 | #ifdef CONFIG_X86_32 |
---|
957 | 1024 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
---|
958 | | - irq_ctx_init(cpu); |
---|
959 | 1025 | per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); |
---|
960 | 1026 | #else |
---|
961 | 1027 | initial_gs = per_cpu_offset(cpu); |
---|
962 | 1028 | #endif |
---|
| 1029 | + return 0; |
---|
963 | 1030 | } |
---|
964 | 1031 | |
---|
965 | 1032 | /* |
---|
.. | .. |
---|
971 | 1038 | static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, |
---|
972 | 1039 | int *cpu0_nmi_registered) |
---|
973 | 1040 | { |
---|
974 | | - volatile u32 *trampoline_status = |
---|
975 | | - (volatile u32 *) __va(real_mode_header->trampoline_status); |
---|
976 | 1041 | /* start_ip had better be page-aligned! */ |
---|
977 | 1042 | unsigned long start_ip = real_mode_header->trampoline_start; |
---|
978 | 1043 | |
---|
.. | .. |
---|
1064 | 1129 | } |
---|
1065 | 1130 | } |
---|
1066 | 1131 | |
---|
1067 | | - /* mark "stuck" area as not stuck */ |
---|
1068 | | - *trampoline_status = 0; |
---|
1069 | | - |
---|
1070 | 1132 | if (x86_platform.legacy.warm_reset) { |
---|
1071 | 1133 | /* |
---|
1072 | 1134 | * Cleanup possible dangling ends... |
---|
.. | .. |
---|
1117 | 1179 | /* the FPU context is blank, nobody can own it */ |
---|
1118 | 1180 | per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; |
---|
1119 | 1181 | |
---|
1120 | | - common_cpu_up(cpu, tidle); |
---|
| 1182 | + err = common_cpu_up(cpu, tidle); |
---|
| 1183 | + if (err) |
---|
| 1184 | + return err; |
---|
1121 | 1185 | |
---|
1122 | 1186 | err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); |
---|
1123 | 1187 | if (err) { |
---|
.. | .. |
---|
1178 | 1242 | physid_set_mask_of_physid(0, &phys_cpu_present_map); |
---|
1179 | 1243 | cpumask_set_cpu(0, topology_sibling_cpumask(0)); |
---|
1180 | 1244 | cpumask_set_cpu(0, topology_core_cpumask(0)); |
---|
| 1245 | + cpumask_set_cpu(0, topology_die_cpumask(0)); |
---|
1181 | 1246 | } |
---|
1182 | 1247 | |
---|
1183 | 1248 | /* |
---|
.. | .. |
---|
1273 | 1338 | for_each_possible_cpu(i) { |
---|
1274 | 1339 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); |
---|
1275 | 1340 | zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); |
---|
| 1341 | + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); |
---|
1276 | 1342 | zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); |
---|
1277 | 1343 | } |
---|
1278 | 1344 | |
---|
.. | .. |
---|
1286 | 1352 | set_sched_topology(x86_topology); |
---|
1287 | 1353 | |
---|
1288 | 1354 | set_cpu_sibling_map(0); |
---|
1289 | | - |
---|
| 1355 | + init_freq_invariance(false); |
---|
1290 | 1356 | smp_sanity_check(); |
---|
1291 | 1357 | |
---|
1292 | 1358 | switch (apic_intr_mode) { |
---|
.. | .. |
---|
1312 | 1378 | pr_info("CPU0: "); |
---|
1313 | 1379 | print_cpu_info(&cpu_data(0)); |
---|
1314 | 1380 | |
---|
1315 | | - native_pv_lock_init(); |
---|
1316 | | - |
---|
1317 | 1381 | uv_system_init(); |
---|
1318 | 1382 | |
---|
1319 | 1383 | set_mtrr_aps_delayed_init(); |
---|
.. | .. |
---|
1323 | 1387 | speculative_store_bypass_ht_init(); |
---|
1324 | 1388 | } |
---|
1325 | 1389 | |
---|
1326 | | -void arch_enable_nonboot_cpus_begin(void) |
---|
| 1390 | +void arch_thaw_secondary_cpus_begin(void) |
---|
1327 | 1391 | { |
---|
1328 | 1392 | set_mtrr_aps_delayed_init(); |
---|
1329 | 1393 | } |
---|
1330 | 1394 | |
---|
1331 | | -void arch_enable_nonboot_cpus_end(void) |
---|
| 1395 | +void arch_thaw_secondary_cpus_end(void) |
---|
1332 | 1396 | { |
---|
1333 | 1397 | mtrr_aps_init(); |
---|
1334 | 1398 | } |
---|
.. | .. |
---|
1343 | 1407 | /* already set me in cpu_online_mask in boot_cpu_init() */ |
---|
1344 | 1408 | cpumask_set_cpu(me, cpu_callout_mask); |
---|
1345 | 1409 | cpu_set_state_online(me); |
---|
| 1410 | + native_pv_lock_init(); |
---|
1346 | 1411 | } |
---|
1347 | 1412 | |
---|
1348 | 1413 | void __init calculate_max_logical_packages(void) |
---|
.. | .. |
---|
1384 | 1449 | /* |
---|
1385 | 1450 | * cpu_possible_mask should be static, it cannot change as cpu's |
---|
1386 | 1451 | * are onlined, or offlined. The reason is per-cpu data-structures |
---|
1387 | | - * are allocated by some modules at init time, and dont expect to |
---|
| 1452 | + * are allocated by some modules at init time, and don't expect to |
---|
1388 | 1453 | * do this dynamically on cpu arrival/departure. |
---|
1389 | 1454 | * cpu_present_mask on the other hand can change dynamically. |
---|
1390 | 1455 | * In case when cpu_hotplug is not compiled, then we resort to current |
---|
.. | .. |
---|
1493 | 1558 | cpu_data(sibling).booted_cores--; |
---|
1494 | 1559 | } |
---|
1495 | 1560 | |
---|
| 1561 | + for_each_cpu(sibling, topology_die_cpumask(cpu)) |
---|
| 1562 | + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); |
---|
1496 | 1563 | for_each_cpu(sibling, topology_sibling_cpumask(cpu)) |
---|
1497 | 1564 | cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); |
---|
1498 | 1565 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) |
---|
.. | .. |
---|
1500 | 1567 | cpumask_clear(cpu_llc_shared_mask(cpu)); |
---|
1501 | 1568 | cpumask_clear(topology_sibling_cpumask(cpu)); |
---|
1502 | 1569 | cpumask_clear(topology_core_cpumask(cpu)); |
---|
| 1570 | + cpumask_clear(topology_die_cpumask(cpu)); |
---|
1503 | 1571 | c->cpu_core_id = 0; |
---|
1504 | 1572 | c->booted_cores = 0; |
---|
1505 | 1573 | cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); |
---|
.. | .. |
---|
1538 | 1606 | if (ret) |
---|
1539 | 1607 | return ret; |
---|
1540 | 1608 | |
---|
1541 | | - clear_local_APIC(); |
---|
1542 | 1609 | cpu_disable_common(); |
---|
| 1610 | + |
---|
| 1611 | + /* |
---|
| 1612 | + * Disable the local APIC. Otherwise IPI broadcasts will reach |
---|
| 1613 | + * it. It still responds normally to INIT, NMI, SMI, and SIPI |
---|
| 1614 | + * messages. |
---|
| 1615 | + * |
---|
| 1616 | + * Disabling the APIC must happen after cpu_disable_common() |
---|
| 1617 | + * which invokes fixup_irqs(). |
---|
| 1618 | + * |
---|
| 1619 | + * Disabling the APIC preserves already set bits in IRR, but |
---|
| 1620 | + * an interrupt arriving after disabling the local APIC does not |
---|
| 1621 | + * set the corresponding IRR bit. |
---|
| 1622 | + * |
---|
| 1623 | + * fixup_irqs() scans IRR for set bits so it can raise a not |
---|
| 1624 | + * yet handled interrupt on the new destination CPU via an IPI |
---|
| 1625 | + * but obviously it can't do so for IRR bits which are not set. |
---|
| 1626 | + * IOW, interrupts arriving after disabling the local APIC will |
---|
| 1627 | + * be lost. |
---|
| 1628 | + */ |
---|
| 1629 | + apic_soft_disable(); |
---|
1543 | 1630 | |
---|
1544 | 1631 | return 0; |
---|
1545 | 1632 | } |
---|
.. | .. |
---|
1580 | 1667 | local_irq_disable(); |
---|
1581 | 1668 | } |
---|
1582 | 1669 | |
---|
1583 | | -static bool wakeup_cpu0(void) |
---|
| 1670 | +/** |
---|
| 1671 | + * cond_wakeup_cpu0 - Wake up CPU0 if needed. |
---|
| 1672 | + * |
---|
| 1673 | + * If NMI wants to wake up CPU0, start CPU0. |
---|
| 1674 | + */ |
---|
| 1675 | +void cond_wakeup_cpu0(void) |
---|
1584 | 1676 | { |
---|
1585 | 1677 | if (smp_processor_id() == 0 && enable_start_cpu0) |
---|
1586 | | - return true; |
---|
1587 | | - |
---|
1588 | | - return false; |
---|
| 1678 | + start_cpu0(); |
---|
1589 | 1679 | } |
---|
| 1680 | +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); |
---|
1590 | 1681 | |
---|
1591 | 1682 | /* |
---|
1592 | 1683 | * We need to flush the caches before going to sleep, lest we have |
---|
.. | .. |
---|
1594 | 1685 | */ |
---|
1595 | 1686 | static inline void mwait_play_dead(void) |
---|
1596 | 1687 | { |
---|
| 1688 | + struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); |
---|
1597 | 1689 | unsigned int eax, ebx, ecx, edx; |
---|
1598 | 1690 | unsigned int highest_cstate = 0; |
---|
1599 | 1691 | unsigned int highest_subcstate = 0; |
---|
1600 | | - void *mwait_ptr; |
---|
1601 | 1692 | int i; |
---|
1602 | 1693 | |
---|
1603 | | - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) |
---|
| 1694 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || |
---|
| 1695 | + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) |
---|
1604 | 1696 | return; |
---|
1605 | 1697 | if (!this_cpu_has(X86_FEATURE_MWAIT)) |
---|
1606 | 1698 | return; |
---|
.. | .. |
---|
1631 | 1723 | (highest_subcstate - 1); |
---|
1632 | 1724 | } |
---|
1633 | 1725 | |
---|
1634 | | - /* |
---|
1635 | | - * This should be a memory location in a cache line which is |
---|
1636 | | - * unlikely to be touched by other processors. The actual |
---|
1637 | | - * content is immaterial as it is not actually modified in any way. |
---|
1638 | | - */ |
---|
1639 | | - mwait_ptr = ¤t_thread_info()->flags; |
---|
1640 | | - |
---|
1641 | 1726 | wbinvd(); |
---|
1642 | 1727 | |
---|
1643 | 1728 | while (1) { |
---|
.. | .. |
---|
1649 | 1734 | * case where we return around the loop. |
---|
1650 | 1735 | */ |
---|
1651 | 1736 | mb(); |
---|
1652 | | - clflush(mwait_ptr); |
---|
| 1737 | + clflush(md); |
---|
1653 | 1738 | mb(); |
---|
1654 | | - __monitor(mwait_ptr, 0, 0); |
---|
| 1739 | + __monitor(md, 0, 0); |
---|
1655 | 1740 | mb(); |
---|
1656 | 1741 | __mwait(eax, 0); |
---|
1657 | | - /* |
---|
1658 | | - * If NMI wants to wake up CPU0, start CPU0. |
---|
1659 | | - */ |
---|
1660 | | - if (wakeup_cpu0()) |
---|
1661 | | - start_cpu0(); |
---|
| 1742 | + |
---|
| 1743 | + cond_wakeup_cpu0(); |
---|
1662 | 1744 | } |
---|
1663 | 1745 | } |
---|
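
A note on the mwait_cpu_dead change above: MWAIT resumes whenever another CPU writes the monitored cache line, so parking a dead CPU on current_thread_info()->flags (the old code) meant remote flag updates could keep kicking it out of the sleep loop. The replacement is a per-CPU structure that is cache-line aligned and used for nothing else, so such wakeups become unlikely, and it reserves control/status words on that private line. A user-space sketch of the same padding idea (illustration only, not the kernel's DEFINE_PER_CPU_ALIGNED() machinery):

```c
/* Give each CPU/thread its own 64-byte slot so a write by one never
 * dirties the line another is monitoring or spinning on. */
#include <stdio.h>

#define NR_CPUS	8

struct mwait_slot {
	unsigned int control;
	unsigned int status;
} __attribute__((aligned(64)));		/* pad each slot to a full line */

static struct mwait_slot slots[NR_CPUS];

int main(void)
{
	printf("slot size %zu bytes -> one cache line per CPU\n",
	       sizeof(slots[0]));
	printf("slot 0 at %p, slot 1 at %p\n",
	       (void *)&slots[0], (void *)&slots[1]);
	return 0;
}
```
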
1664 | 1746 | |
---|
.. | .. |
---|
1669 | 1751 | |
---|
1670 | 1752 | while (1) { |
---|
1671 | 1753 | native_halt(); |
---|
1672 | | - /* |
---|
1673 | | - * If NMI wants to wake up CPU0, start CPU0. |
---|
1674 | | - */ |
---|
1675 | | - if (wakeup_cpu0()) |
---|
1676 | | - start_cpu0(); |
---|
| 1754 | + |
---|
| 1755 | + cond_wakeup_cpu0(); |
---|
1677 | 1756 | } |
---|
1678 | 1757 | } |
---|
1679 | 1758 | |
---|
.. | .. |
---|
1705 | 1784 | } |
---|
1706 | 1785 | |
---|
1707 | 1786 | #endif |
---|
| 1787 | + |
---|
| 1788 | +#ifdef CONFIG_X86_64 |
---|
| 1789 | +/* |
---|
| 1790 | + * APERF/MPERF frequency ratio computation. |
---|
| 1791 | + * |
---|
| 1792 | + * The scheduler wants to do frequency invariant accounting and needs a <1 |
---|
| 1793 | + * ratio to account for the 'current' frequency, corresponding to |
---|
| 1794 | + * freq_curr / freq_max. |
---|
| 1795 | + * |
---|
| 1796 | + * Since the frequency freq_curr on x86 is controlled by micro-controller and |
---|
| 1797 | + * our P-state setting is little more than a request/hint, we need to observe |
---|
| 1798 | + * the effective frequency 'BusyMHz', i.e. the average frequency over a time |
---|
| 1799 | + * interval after discarding idle time. This is given by: |
---|
| 1800 | + * |
---|
| 1801 | + * BusyMHz = delta_APERF / delta_MPERF * freq_base |
---|
| 1802 | + * |
---|
| 1803 | + * where freq_base is the max non-turbo P-state. |
---|
| 1804 | + * |
---|
| 1805 | + * The freq_max term has to be set to a somewhat arbitrary value, because we |
---|
| 1806 | + * can't know which turbo states will be available at a given point in time: |
---|
| 1807 | + * it all depends on the thermal headroom of the entire package. We set it to |
---|
| 1808 | + * the turbo level with 4 cores active. |
---|
| 1809 | + * |
---|
| 1810 | + * Benchmarks show that's a good compromise between the 1C turbo ratio |
---|
| 1811 | + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, |
---|
| 1812 | + * which would ignore the entire turbo range (a conspicuous part, making |
---|
| 1813 | + * freq_curr/freq_max always maxed out). |
---|
| 1814 | + * |
---|
| 1815 | + * An exception to the heuristic above is the Atom uarch, where we choose the |
---|
| 1816 | + * highest turbo level for freq_max since Atom's are generally oriented towards |
---|
| 1817 | + * power efficiency. |
---|
| 1818 | + * |
---|
| 1819 | + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio |
---|
| 1820 | + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. |
---|
| 1821 | + */ |
---|
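
Restating the comment's BusyMHz relation in the fixed-point form the tick handler below actually computes (SCHED_CAPACITY_SCALE = 1024); this is only a rewrite of the formulas above, not new material:

```latex
\[
  \mathrm{freq\_scale}
    = \min\!\left(1,\;
        \frac{\Delta \mathrm{APERF}}{\Delta \mathrm{MPERF}}
        \cdot \frac{\mathrm{freq\_base}}{\mathrm{freq\_max}}\right)
      \times 2^{10}
  \qquad\text{where}\qquad
  \mathrm{arch\_max\_freq\_ratio}
    = \frac{\mathrm{freq\_max}}{\mathrm{freq\_base}} \times 2^{10}.
\]
```
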
| 1822 | + |
---|
| 1823 | +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); |
---|
| 1824 | + |
---|
| 1825 | +static DEFINE_PER_CPU(u64, arch_prev_aperf); |
---|
| 1826 | +static DEFINE_PER_CPU(u64, arch_prev_mperf); |
---|
| 1827 | +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; |
---|
| 1828 | +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; |
---|
| 1829 | + |
---|
| 1830 | +void arch_set_max_freq_ratio(bool turbo_disabled) |
---|
| 1831 | +{ |
---|
| 1832 | + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : |
---|
| 1833 | + arch_turbo_freq_ratio; |
---|
| 1834 | +} |
---|
| 1835 | +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); |
---|
| 1836 | + |
---|
| 1837 | +static bool turbo_disabled(void) |
---|
| 1838 | +{ |
---|
| 1839 | + u64 misc_en; |
---|
| 1840 | + int err; |
---|
| 1841 | + |
---|
| 1842 | + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); |
---|
| 1843 | + if (err) |
---|
| 1844 | + return false; |
---|
| 1845 | + |
---|
| 1846 | + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); |
---|
| 1847 | +} |
---|
| 1848 | + |
---|
| 1849 | +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) |
---|
| 1850 | +{ |
---|
| 1851 | + int err; |
---|
| 1852 | + |
---|
| 1853 | + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); |
---|
| 1854 | + if (err) |
---|
| 1855 | + return false; |
---|
| 1856 | + |
---|
| 1857 | + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); |
---|
| 1858 | + if (err) |
---|
| 1859 | + return false; |
---|
| 1860 | + |
---|
| 1861 | + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ |
---|
| 1862 | + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ |
---|
| 1863 | + |
---|
| 1864 | + return true; |
---|
| 1865 | +} |
---|
| 1866 | + |
---|
| 1867 | +#include <asm/cpu_device_id.h> |
---|
| 1868 | +#include <asm/intel-family.h> |
---|
| 1869 | + |
---|
| 1870 | +#define X86_MATCH(model) \ |
---|
| 1871 | + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ |
---|
| 1872 | + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) |
---|
| 1873 | + |
---|
| 1874 | +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { |
---|
| 1875 | + X86_MATCH(XEON_PHI_KNL), |
---|
| 1876 | + X86_MATCH(XEON_PHI_KNM), |
---|
| 1877 | + {} |
---|
| 1878 | +}; |
---|
| 1879 | + |
---|
| 1880 | +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { |
---|
| 1881 | + X86_MATCH(SKYLAKE_X), |
---|
| 1882 | + {} |
---|
| 1883 | +}; |
---|
| 1884 | + |
---|
| 1885 | +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { |
---|
| 1886 | + X86_MATCH(ATOM_GOLDMONT), |
---|
| 1887 | + X86_MATCH(ATOM_GOLDMONT_D), |
---|
| 1888 | + X86_MATCH(ATOM_GOLDMONT_PLUS), |
---|
| 1889 | + {} |
---|
| 1890 | +}; |
---|
| 1891 | + |
---|
| 1892 | +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, |
---|
| 1893 | + int num_delta_fratio) |
---|
| 1894 | +{ |
---|
| 1895 | + int fratio, delta_fratio, found; |
---|
| 1896 | + int err, i; |
---|
| 1897 | + u64 msr; |
---|
| 1898 | + |
---|
| 1899 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1900 | + if (err) |
---|
| 1901 | + return false; |
---|
| 1902 | + |
---|
| 1903 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1904 | + |
---|
| 1905 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); |
---|
| 1906 | + if (err) |
---|
| 1907 | + return false; |
---|
| 1908 | + |
---|
| 1909 | + fratio = (msr >> 8) & 0xFF; |
---|
| 1910 | + i = 16; |
---|
| 1911 | + found = 0; |
---|
| 1912 | + do { |
---|
| 1913 | + if (found >= num_delta_fratio) { |
---|
| 1914 | + *turbo_freq = fratio; |
---|
| 1915 | + return true; |
---|
| 1916 | + } |
---|
| 1917 | + |
---|
| 1918 | + delta_fratio = (msr >> (i + 5)) & 0x7; |
---|
| 1919 | + |
---|
| 1920 | + if (delta_fratio) { |
---|
| 1921 | + found += 1; |
---|
| 1922 | + fratio -= delta_fratio; |
---|
| 1923 | + } |
---|
| 1924 | + |
---|
| 1925 | + i += 8; |
---|
| 1926 | + } while (i < 64); |
---|
| 1927 | + |
---|
| 1928 | + return true; |
---|
| 1929 | +} |
---|
| 1930 | + |
---|
| 1931 | +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) |
---|
| 1932 | +{ |
---|
| 1933 | + u64 ratios, counts; |
---|
| 1934 | + u32 group_size; |
---|
| 1935 | + int err, i; |
---|
| 1936 | + |
---|
| 1937 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1938 | + if (err) |
---|
| 1939 | + return false; |
---|
| 1940 | + |
---|
| 1941 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1942 | + |
---|
| 1943 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); |
---|
| 1944 | + if (err) |
---|
| 1945 | + return false; |
---|
| 1946 | + |
---|
| 1947 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); |
---|
| 1948 | + if (err) |
---|
| 1949 | + return false; |
---|
| 1950 | + |
---|
| 1951 | + for (i = 0; i < 64; i += 8) { |
---|
| 1952 | + group_size = (counts >> i) & 0xFF; |
---|
| 1953 | + if (group_size >= size) { |
---|
| 1954 | + *turbo_freq = (ratios >> i) & 0xFF; |
---|
| 1955 | + return true; |
---|
| 1956 | + } |
---|
| 1957 | + } |
---|
| 1958 | + |
---|
| 1959 | + return false; |
---|
| 1960 | +} |
---|
| 1961 | + |
---|
| 1962 | +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) |
---|
| 1963 | +{ |
---|
| 1964 | + u64 msr; |
---|
| 1965 | + int err; |
---|
| 1966 | + |
---|
| 1967 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1968 | + if (err) |
---|
| 1969 | + return false; |
---|
| 1970 | + |
---|
| 1971 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); |
---|
| 1972 | + if (err) |
---|
| 1973 | + return false; |
---|
| 1974 | + |
---|
| 1975 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1976 | + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ |
---|
| 1977 | + |
---|
| 1978 | + /* The CPU may have less than 4 cores */ |
---|
| 1979 | + if (!*turbo_freq) |
---|
| 1980 | + *turbo_freq = msr & 0xFF; /* 1C turbo */ |
---|
| 1981 | + |
---|
| 1982 | + return true; |
---|
| 1983 | +} |
---|
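
For comparison, the same decode can be reproduced from user space through the msr character device, which may help when sanity-checking the ratio the kernel derives. A hedged sketch (assumes root, the msr module loaded, and a recent Intel core part where MSR_PLATFORM_INFO is 0xCE, MSR_TURBO_RATIO_LIMIT is 0x1AD and ratios are in units of the 100 MHz bus clock):

```c
/* Read base (max non-turbo) and 4C turbo ratios from CPU 0's MSRs. */
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <unistd.h>

static int rdmsr(int fd, uint32_t reg, uint64_t *val)
{
	return pread(fd, val, sizeof(*val), reg) == sizeof(*val) ? 0 : -1;
}

int main(void)
{
	uint64_t plat, turbo;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || rdmsr(fd, 0xCE, &plat) || rdmsr(fd, 0x1AD, &turbo)) {
		perror("msr");
		return 1;
	}

	uint64_t base_ratio  = (plat >> 8) & 0xFF;	/* max P-state */
	uint64_t turbo_ratio = (turbo >> 24) & 0xFF;	/* 4C turbo    */

	if (!turbo_ratio)				/* <4 cores    */
		turbo_ratio = turbo & 0xFF;		/* 1C turbo    */

	printf("base %" PRIu64 " MHz, 4C turbo %" PRIu64 " MHz, ratio %.3f\n",
	       base_ratio * 100, turbo_ratio * 100,
	       (double)turbo_ratio / base_ratio);
	close(fd);
	return 0;
}
```
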
| 1984 | + |
---|
| 1985 | +static bool intel_set_max_freq_ratio(void) |
---|
| 1986 | +{ |
---|
| 1987 | + u64 base_freq, turbo_freq; |
---|
| 1988 | + u64 turbo_ratio; |
---|
| 1989 | + |
---|
| 1990 | + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) |
---|
| 1991 | + goto out; |
---|
| 1992 | + |
---|
| 1993 | + if (x86_match_cpu(has_glm_turbo_ratio_limits) && |
---|
| 1994 | + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) |
---|
| 1995 | + goto out; |
---|
| 1996 | + |
---|
| 1997 | + if (x86_match_cpu(has_knl_turbo_ratio_limits) && |
---|
| 1998 | + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) |
---|
| 1999 | + goto out; |
---|
| 2000 | + |
---|
| 2001 | + if (x86_match_cpu(has_skx_turbo_ratio_limits) && |
---|
| 2002 | + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) |
---|
| 2003 | + goto out; |
---|
| 2004 | + |
---|
| 2005 | + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) |
---|
| 2006 | + goto out; |
---|
| 2007 | + |
---|
| 2008 | + return false; |
---|
| 2009 | + |
---|
| 2010 | +out: |
---|
| 2011 | + /* |
---|
| 2012 | + * Some hypervisors advertise X86_FEATURE_APERFMPERF |
---|
| 2013 | + * but then fill all MSR's with zeroes. |
---|
| 2014 | + * Some CPUs have turbo boost but don't declare any turbo ratio |
---|
| 2015 | + * in MSR_TURBO_RATIO_LIMIT. |
---|
| 2016 | + */ |
---|
| 2017 | + if (!base_freq || !turbo_freq) { |
---|
| 2018 | + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); |
---|
| 2019 | + return false; |
---|
| 2020 | + } |
---|
| 2021 | + |
---|
| 2022 | + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); |
---|
| 2023 | + if (!turbo_ratio) { |
---|
| 2024 | + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); |
---|
| 2025 | + return false; |
---|
| 2026 | + } |
---|
| 2027 | + |
---|
| 2028 | + arch_turbo_freq_ratio = turbo_ratio; |
---|
| 2029 | + arch_set_max_freq_ratio(turbo_disabled()); |
---|
| 2030 | + |
---|
| 2031 | + return true; |
---|
| 2032 | +} |
---|
| 2033 | + |
---|
| 2034 | +static void init_counter_refs(void) |
---|
| 2035 | +{ |
---|
| 2036 | + u64 aperf, mperf; |
---|
| 2037 | + |
---|
| 2038 | + rdmsrl(MSR_IA32_APERF, aperf); |
---|
| 2039 | + rdmsrl(MSR_IA32_MPERF, mperf); |
---|
| 2040 | + |
---|
| 2041 | + this_cpu_write(arch_prev_aperf, aperf); |
---|
| 2042 | + this_cpu_write(arch_prev_mperf, mperf); |
---|
| 2043 | +} |
---|
| 2044 | + |
---|
| 2045 | +static void init_freq_invariance(bool secondary) |
---|
| 2046 | +{ |
---|
| 2047 | + bool ret = false; |
---|
| 2048 | + |
---|
| 2049 | + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) |
---|
| 2050 | + return; |
---|
| 2051 | + |
---|
| 2052 | + if (secondary) { |
---|
| 2053 | + if (static_branch_likely(&arch_scale_freq_key)) { |
---|
| 2054 | + init_counter_refs(); |
---|
| 2055 | + } |
---|
| 2056 | + return; |
---|
| 2057 | + } |
---|
| 2058 | + |
---|
| 2059 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) |
---|
| 2060 | + ret = intel_set_max_freq_ratio(); |
---|
| 2061 | + |
---|
| 2062 | + if (ret) { |
---|
| 2063 | + init_counter_refs(); |
---|
| 2064 | + static_branch_enable(&arch_scale_freq_key); |
---|
| 2065 | + } else { |
---|
| 2066 | + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); |
---|
| 2067 | + } |
---|
| 2068 | +} |
---|
| 2069 | + |
---|
| 2070 | +static void disable_freq_invariance_workfn(struct work_struct *work) |
---|
| 2071 | +{ |
---|
| 2072 | + static_branch_disable(&arch_scale_freq_key); |
---|
| 2073 | +} |
---|
| 2074 | + |
---|
| 2075 | +static DECLARE_WORK(disable_freq_invariance_work, |
---|
| 2076 | + disable_freq_invariance_workfn); |
---|
| 2077 | + |
---|
| 2078 | +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; |
---|
| 2079 | + |
---|
| 2080 | +void arch_scale_freq_tick(void) |
---|
| 2081 | +{ |
---|
| 2082 | + u64 freq_scale = SCHED_CAPACITY_SCALE; |
---|
| 2083 | + u64 aperf, mperf; |
---|
| 2084 | + u64 acnt, mcnt; |
---|
| 2085 | + |
---|
| 2086 | + if (!arch_scale_freq_invariant()) |
---|
| 2087 | + return; |
---|
| 2088 | + |
---|
| 2089 | + rdmsrl(MSR_IA32_APERF, aperf); |
---|
| 2090 | + rdmsrl(MSR_IA32_MPERF, mperf); |
---|
| 2091 | + |
---|
| 2092 | + acnt = aperf - this_cpu_read(arch_prev_aperf); |
---|
| 2093 | + mcnt = mperf - this_cpu_read(arch_prev_mperf); |
---|
| 2094 | + |
---|
| 2095 | + this_cpu_write(arch_prev_aperf, aperf); |
---|
| 2096 | + this_cpu_write(arch_prev_mperf, mperf); |
---|
| 2097 | + |
---|
| 2098 | + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) |
---|
| 2099 | + goto error; |
---|
| 2100 | + |
---|
| 2101 | + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) |
---|
| 2102 | + goto error; |
---|
| 2103 | + |
---|
| 2104 | + freq_scale = div64_u64(acnt, mcnt); |
---|
| 2105 | + if (!freq_scale) |
---|
| 2106 | + goto error; |
---|
| 2107 | + |
---|
| 2108 | + if (freq_scale > SCHED_CAPACITY_SCALE) |
---|
| 2109 | + freq_scale = SCHED_CAPACITY_SCALE; |
---|
| 2110 | + |
---|
| 2111 | + this_cpu_write(arch_freq_scale, freq_scale); |
---|
| 2112 | + return; |
---|
| 2113 | + |
---|
| 2114 | +error: |
---|
| 2115 | + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); |
---|
| 2116 | + schedule_work(&disable_freq_invariance_work); |
---|
| 2117 | +} |
---|
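
To make the fixed-point arithmetic above concrete, here is a worked example with assumed numbers (2.0 GHz base, 3.0 GHz 4-core turbo, a tick over which the CPU ran at an effective 2.6 GHz); it mirrors the shift/multiply/divide sequence of arch_scale_freq_tick() minus the overflow guards:

```c
/* Worked example of the freq_scale fixed-point math, sample data only. */
#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t acnt = 2600000;		/* delta APERF over the tick */
	uint64_t mcnt = 2000000;		/* delta MPERF over the tick */
	uint64_t max_freq_ratio = 1536;		/* 3.0 GHz / 2.0 GHz * 1024  */
	uint64_t freq_scale;

	acnt <<= 2 * SCHED_CAPACITY_SHIFT;	/* scale numerator by 2^20    */
	mcnt *= max_freq_ratio;			/* fold in freq_max/freq_base */

	freq_scale = acnt / mcnt;
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	/* 2.6 GHz busy / 3.0 GHz max ~= 0.866 -> prints 887 (of 1024). */
	printf("freq_scale = %lu\n", (unsigned long)freq_scale);
	return 0;
}
```

The kernel additionally wraps the shift and multiply in check_shl_overflow()/check_mul_overflow() and, on overflow or a zero result, schedules disable_freq_invariance_work to turn the static key back off, as shown in the error path above.
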
| 2118 | +#else |
---|
| 2119 | +static inline void init_freq_invariance(bool secondary) |
---|
| 2120 | +{ |
---|
| 2121 | +} |
---|
| 2122 | +#endif /* CONFIG_X86_64 */ |
---|