.. | .. |
---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
---|
1 | 2 | /* |
---|
2 | 3 | * x86 SMP booting functions |
---|
3 | 4 | * |
---|
.. | .. |
---|
11 | 12 | * Thanks to Intel for making available several different Pentium, |
---|
12 | 13 | * Pentium Pro and Pentium-II/Xeon MP machines. |
---|
13 | 14 | * Original development of Linux SMP code supported by Caldera. |
---|
14 | | - * |
---|
15 | | - * This code is released under the GNU General Public License version 2 or |
---|
16 | | - * later. |
---|
17 | 15 | * |
---|
18 | 16 | * Fixes |
---|
19 | 17 | * Felix Koop : NR_CPUS used properly |
---|
.. | .. |
---|
49 | 47 | #include <linux/sched/hotplug.h> |
---|
50 | 48 | #include <linux/sched/task_stack.h> |
---|
51 | 49 | #include <linux/percpu.h> |
---|
52 | | -#include <linux/bootmem.h> |
---|
| 50 | +#include <linux/memblock.h> |
---|
53 | 51 | #include <linux/err.h> |
---|
54 | 52 | #include <linux/nmi.h> |
---|
55 | 53 | #include <linux/tboot.h> |
---|
56 | | -#include <linux/stackprotector.h> |
---|
57 | 54 | #include <linux/gfp.h> |
---|
58 | 55 | #include <linux/cpuidle.h> |
---|
| 56 | +#include <linux/numa.h> |
---|
| 57 | +#include <linux/pgtable.h> |
---|
| 58 | +#include <linux/overflow.h> |
---|
59 | 59 | |
---|
60 | 60 | #include <asm/acpi.h> |
---|
61 | 61 | #include <asm/desc.h> |
---|
.. | .. |
---|
64 | 64 | #include <asm/realmode.h> |
---|
65 | 65 | #include <asm/cpu.h> |
---|
66 | 66 | #include <asm/numa.h> |
---|
67 | | -#include <asm/pgtable.h> |
---|
68 | 67 | #include <asm/tlbflush.h> |
---|
69 | 68 | #include <asm/mtrr.h> |
---|
70 | 69 | #include <asm/mwait.h> |
---|
.. | .. |
---|
81 | 80 | #include <asm/cpu_device_id.h> |
---|
82 | 81 | #include <asm/spec-ctrl.h> |
---|
83 | 82 | #include <asm/hw_irq.h> |
---|
| 83 | +#include <asm/stackprotector.h> |
---|
84 | 84 | |
---|
85 | 85 | /* representing HT siblings of each logical CPU */ |
---|
86 | 86 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); |
---|
.. | .. |
---|
89 | 89 | /* representing HT and core siblings of each logical CPU */ |
---|
90 | 90 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); |
---|
91 | 91 | EXPORT_PER_CPU_SYMBOL(cpu_core_map); |
---|
| 92 | + |
---|
| 93 | +/* representing HT, core, and die siblings of each logical CPU */ |
---|
| 94 | +DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map); |
---|
| 95 | +EXPORT_PER_CPU_SYMBOL(cpu_die_map); |
---|
92 | 96 | |
---|
93 | 97 | DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); |
---|
94 | 98 | |
---|
.. | .. |
---|
100 | 104 | unsigned int __max_logical_packages __read_mostly; |
---|
101 | 105 | EXPORT_SYMBOL(__max_logical_packages); |
---|
102 | 106 | static unsigned int logical_packages __read_mostly; |
---|
| 107 | +static unsigned int logical_die __read_mostly; |
---|
103 | 108 | |
---|
104 | 109 | /* Maximum number of SMT threads on any online core */ |
---|
105 | 110 | int __read_mostly __max_smt_threads = 1; |
---|
.. | .. |
---|
143 | 148 | *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; |
---|
144 | 149 | } |
---|
145 | 150 | |
---|
| 151 | +static void init_freq_invariance(bool secondary); |
---|
| 152 | + |
---|
146 | 153 | /* |
---|
147 | 154 | * Report back to the Boot Processor during boot time or to the caller processor |
---|
148 | 155 | * during CPU online. |
---|
149 | 156 | */ |
---|
150 | 157 | static void smp_callin(void) |
---|
151 | 158 | { |
---|
152 | | - int cpuid, phys_id; |
---|
| 159 | + int cpuid; |
---|
153 | 160 | |
---|
154 | 161 | /* |
---|
155 | 162 | * If waken up by an INIT in an 82489DX configuration |
---|
.. | .. |
---|
158 | 165 | * now safe to touch our local APIC. |
---|
159 | 166 | */ |
---|
160 | 167 | cpuid = smp_processor_id(); |
---|
161 | | - |
---|
162 | | - /* |
---|
163 | | - * (This works even if the APIC is not enabled.) |
---|
164 | | - */ |
---|
165 | | - phys_id = read_apic_id(); |
---|
166 | 168 | |
---|
167 | 169 | /* |
---|
168 | 170 | * the boot CPU has finished the init stage and is spinning |
---|
.. | .. |
---|
183 | 185 | * calibrate_delay() and notify_cpu_starting(). |
---|
184 | 186 | */ |
---|
185 | 187 | set_cpu_sibling_map(raw_smp_processor_id()); |
---|
| 188 | + |
---|
| 189 | + init_freq_invariance(true); |
---|
186 | 190 | |
---|
187 | 191 | /* |
---|
188 | 192 | * Get our bogomips. |
---|
.. | .. |
---|
216 | 220 | * before cpu_init(), SMP booting is too fragile that we want to |
---|
217 | 221 | * limit the things done here to the most necessary things. |
---|
218 | 222 | */ |
---|
219 | | - if (boot_cpu_has(X86_FEATURE_PCID)) |
---|
220 | | - __write_cr4(__read_cr4() | X86_CR4_PCIDE); |
---|
| 223 | + cr4_init(); |
---|
221 | 224 | |
---|
222 | 225 | #ifdef CONFIG_X86_32 |
---|
223 | 226 | /* switch away from the initial page table */ |
---|
224 | 227 | load_cr3(swapper_pg_dir); |
---|
225 | | - /* |
---|
226 | | - * Initialize the CR4 shadow before doing anything that could |
---|
227 | | - * try to read it. |
---|
228 | | - */ |
---|
229 | | - cr4_init_shadow(); |
---|
230 | 228 | __flush_tlb_all(); |
---|
231 | 229 | #endif |
---|
232 | | - load_current_idt(); |
---|
| 230 | + cpu_init_exception_handling(); |
---|
233 | 231 | cpu_init(); |
---|
| 232 | + rcu_cpu_starting(raw_smp_processor_id()); |
---|
234 | 233 | x86_cpuinit.early_percpu_clock_init(); |
---|
235 | | - preempt_disable(); |
---|
236 | 234 | smp_callin(); |
---|
237 | 235 | |
---|
238 | 236 | enable_start_cpu0 = 0; |
---|
.. | .. |
---|
262 | 260 | /* enable local interrupts */ |
---|
263 | 261 | local_irq_enable(); |
---|
264 | 262 | |
---|
265 | | - /* to prevent fake stack check failure in clock setup */ |
---|
266 | | - boot_init_stack_canary(); |
---|
267 | | - |
---|
268 | 263 | x86_cpuinit.setup_percpu_clockev(); |
---|
269 | 264 | |
---|
270 | 265 | wmb(); |
---|
271 | 266 | cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); |
---|
272 | | - |
---|
273 | | - /* |
---|
274 | | - * Prevent tail call to cpu_startup_entry() because the stack protector |
---|
275 | | - * guard has been changed a couple of function calls up, in |
---|
276 | | - * boot_init_stack_canary() and must not be checked before tail calling |
---|
277 | | - * another function. |
---|
278 | | - */ |
---|
279 | | - prevent_tail_call_optimization(); |
---|
280 | 267 | } |
---|
281 | 268 | |
---|
282 | 269 | /** |
---|
.. | .. |
---|
314 | 301 | return -1; |
---|
315 | 302 | } |
---|
316 | 303 | EXPORT_SYMBOL(topology_phys_to_logical_pkg); |
---|
| 304 | +/** |
---|
| 305 | + * topology_phys_to_logical_die - Map a physical die id to logical |
---|
| 306 | + * |
---|
| 307 | + * Returns logical die id or -1 if not found |
---|
| 308 | + */ |
---|
| 309 | +int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu) |
---|
| 310 | +{ |
---|
| 311 | + int cpu; |
---|
| 312 | + int proc_id = cpu_data(cur_cpu).phys_proc_id; |
---|
| 313 | + |
---|
| 314 | + for_each_possible_cpu(cpu) { |
---|
| 315 | + struct cpuinfo_x86 *c = &cpu_data(cpu); |
---|
| 316 | + |
---|
| 317 | + if (c->initialized && c->cpu_die_id == die_id && |
---|
| 318 | + c->phys_proc_id == proc_id) |
---|
| 319 | + return c->logical_die_id; |
---|
| 320 | + } |
---|
| 321 | + return -1; |
---|
| 322 | +} |
---|
| 323 | +EXPORT_SYMBOL(topology_phys_to_logical_die); |
---|
317 | 324 | |
---|
318 | 325 | /** |
---|
319 | 326 | * topology_update_package_map - Update the physical to logical package map |
---|
.. | .. |
---|
338 | 345 | cpu_data(cpu).logical_proc_id = new; |
---|
339 | 346 | return 0; |
---|
340 | 347 | } |
---|
| 348 | +/** |
---|
| 349 | + * topology_update_die_map - Update the physical to logical die map |
---|
| 350 | + * @die: The die id as retrieved via CPUID |
---|
| 351 | + * @cpu: The cpu for which this is updated |
---|
| 352 | + */ |
---|
| 353 | +int topology_update_die_map(unsigned int die, unsigned int cpu) |
---|
| 354 | +{ |
---|
| 355 | + int new; |
---|
| 356 | + |
---|
| 357 | + /* Already available somewhere? */ |
---|
| 358 | + new = topology_phys_to_logical_die(die, cpu); |
---|
| 359 | + if (new >= 0) |
---|
| 360 | + goto found; |
---|
| 361 | + |
---|
| 362 | + new = logical_die++; |
---|
| 363 | + if (new != die) { |
---|
| 364 | + pr_info("CPU %u Converting physical %u to logical die %u\n", |
---|
| 365 | + cpu, die, new); |
---|
| 366 | + } |
---|
| 367 | +found: |
---|
| 368 | + cpu_data(cpu).logical_die_id = new; |
---|
| 369 | + return 0; |
---|
| 370 | +} |
---|
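A standalone sketch (not part of the patch) of the physical-to-logical die numbering that topology_update_die_map() and topology_phys_to_logical_die() implement above: the first CPU seen on a given (package, die) pair allocates the next free logical die id, and later CPUs on the same die reuse it, so logical ids stay dense even when the physical die ids enumerated by CPUID have gaps. The struct and array below are hypothetical stand-ins for cpu_data(), not kernel APIs.

```c
#include <stdio.h>

/* Hypothetical stand-in for the relevant cpuinfo_x86 fields. */
struct cpu {
	int phys_proc_id;	/* physical package id from CPUID */
	int cpu_die_id;		/* physical die id from CPUID */
	int logical_die_id;	/* assigned below, -1 = not yet set */
};

static int lookup_logical_die(struct cpu *cpus, int n, int pkg, int die)
{
	for (int i = 0; i < n; i++)
		if (cpus[i].logical_die_id >= 0 &&
		    cpus[i].phys_proc_id == pkg && cpus[i].cpu_die_id == die)
			return cpus[i].logical_die_id;
	return -1;
}

int main(void)
{
	/* Two packages, each enumerating physical dies 0 and 3 (gaps happen). */
	struct cpu cpus[] = {
		{ 0, 0, -1 }, { 0, 0, -1 }, { 0, 3, -1 },
		{ 1, 0, -1 }, { 1, 3, -1 },
	};
	int n = sizeof(cpus) / sizeof(cpus[0]);
	int logical_die = 0;

	for (int i = 0; i < n; i++) {
		int new = lookup_logical_die(cpus, n, cpus[i].phys_proc_id,
					     cpus[i].cpu_die_id);

		if (new < 0)		/* first CPU seen on this die */
			new = logical_die++;
		cpus[i].logical_die_id = new;
		printf("cpu%d: pkg %d die %d -> logical die %d\n",
		       i, cpus[i].phys_proc_id, cpus[i].cpu_die_id, new);
	}
	return 0;
}
```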
341 | 371 | |
---|
342 | 372 | void __init smp_store_boot_cpu_info(void) |
---|
343 | 373 | { |
---|
.. | .. |
---|
347 | 377 | *c = boot_cpu_data; |
---|
348 | 378 | c->cpu_index = id; |
---|
349 | 379 | topology_update_package_map(c->phys_proc_id, id); |
---|
| 380 | + topology_update_die_map(c->cpu_die_id, id); |
---|
350 | 381 | c->initialized = true; |
---|
351 | 382 | } |
---|
352 | 383 | |
---|
.. | .. |
---|
401 | 432 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
---|
402 | 433 | |
---|
403 | 434 | if (c->phys_proc_id == o->phys_proc_id && |
---|
| 435 | + c->cpu_die_id == o->cpu_die_id && |
---|
404 | 436 | per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) { |
---|
405 | 437 | if (c->cpu_core_id == o->cpu_core_id) |
---|
406 | 438 | return topology_sane(c, o, "smt"); |
---|
.. | .. |
---|
412 | 444 | } |
---|
413 | 445 | |
---|
414 | 446 | } else if (c->phys_proc_id == o->phys_proc_id && |
---|
| 447 | + c->cpu_die_id == o->cpu_die_id && |
---|
415 | 448 | c->cpu_core_id == o->cpu_core_id) { |
---|
416 | 449 | return topology_sane(c, o, "smt"); |
---|
417 | 450 | } |
---|
.. | .. |
---|
419 | 452 | return false; |
---|
420 | 453 | } |
---|
421 | 454 | |
---|
| 455 | +static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
| 456 | +{ |
---|
| 457 | + if (c->phys_proc_id == o->phys_proc_id && |
---|
| 458 | + c->cpu_die_id == o->cpu_die_id) |
---|
| 459 | + return true; |
---|
| 460 | + return false; |
---|
| 461 | +} |
---|
| 462 | + |
---|
422 | 463 | /* |
---|
423 | | - * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs. |
---|
| 464 | + * Unlike the other levels, we do not enforce keeping a |
---|
| 465 | + * multicore group inside a NUMA node. If this happens, we will |
---|
| 466 | + * discard the MC level of the topology later. |
---|
| 467 | + */ |
---|
| 468 | +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
| 469 | +{ |
---|
| 470 | + if (c->phys_proc_id == o->phys_proc_id) |
---|
| 471 | + return true; |
---|
| 472 | + return false; |
---|
| 473 | +} |
---|
| 474 | + |
---|
| 475 | +/* |
---|
| 476 | + * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs. |
---|
424 | 477 | * |
---|
425 | | - * These are Intel CPUs that enumerate an LLC that is shared by |
---|
426 | | - * multiple NUMA nodes. The LLC on these systems is shared for |
---|
427 | | - * off-package data access but private to the NUMA node (half |
---|
428 | | - * of the package) for on-package access. |
---|
| 478 | + * Any Intel CPU that has multiple nodes per package and does not |
---|
| 479 | + * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology. |
---|
429 | 480 | * |
---|
430 | | - * CPUID (the source of the information about the LLC) can only |
---|
431 | | - * enumerate the cache as being shared *or* unshared, but not |
---|
432 | | - * this particular configuration. The CPU in this case enumerates |
---|
433 | | - * the cache to be shared across the entire package (spanning both |
---|
434 | | - * NUMA nodes). |
---|
| 481 | + * When in SNC mode, these CPUs enumerate an LLC that is shared |
---|
| 482 | + * by multiple NUMA nodes. The LLC is shared for off-package data |
---|
| 483 | + * access but private to the NUMA node (half of the package) for |
---|
| 484 | + * on-package access. CPUID (the source of the information about |
---|
| 485 | + * the LLC) can only enumerate the cache as shared or unshared, |
---|
| 486 | + * but not this particular configuration. |
---|
435 | 487 | */ |
---|
436 | 488 | |
---|
437 | | -static const struct x86_cpu_id snc_cpu[] = { |
---|
438 | | - { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X }, |
---|
| 489 | +static const struct x86_cpu_id intel_cod_cpu[] = { |
---|
| 490 | + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */ |
---|
| 491 | + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */ |
---|
| 492 | + X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */ |
---|
439 | 493 | {} |
---|
440 | 494 | }; |
---|
441 | 495 | |
---|
442 | 496 | static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
443 | 497 | { |
---|
| 498 | + const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu); |
---|
444 | 499 | int cpu1 = c->cpu_index, cpu2 = o->cpu_index; |
---|
| 500 | + bool intel_snc = id && id->driver_data; |
---|
445 | 501 | |
---|
446 | 502 | /* Do not match if we do not have a valid APICID for cpu: */ |
---|
447 | 503 | if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID) |
---|
.. | .. |
---|
456 | 512 | * means 'c' does not share the LLC of 'o'. This will be |
---|
457 | 513 | * reflected to userspace. |
---|
458 | 514 | */ |
---|
459 | | - if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu)) |
---|
| 515 | + if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc) |
---|
460 | 516 | return false; |
---|
461 | 517 | |
---|
462 | 518 | return topology_sane(c, o, "llc"); |
---|
463 | 519 | } |
---|
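A hedged userspace sketch of the lookup pattern used above: like x86_match_cpu(), the helper below returns the first table entry that matches, so the explicit Haswell-X/Broadwell-X rows (driver_data == 0, Cluster-on-Die) take precedence over the catch-all ANY row (driver_data == 1, Sub-NUMA Cluster). The struct, the match helper, and the sentinel are simplified stand-ins, not the real <asm/cpu_device_id.h> API; the model numbers are only illustrative.

```c
#include <stdio.h>

#define ANY_MODEL	-1

/* Simplified stand-in for struct x86_cpu_id; Intel family 6 is implied. */
struct cpu_id {
	int model;			/* specific model or ANY_MODEL */
	unsigned long driver_data;	/* 0 = COD, 1 = SNC, as in intel_cod_cpu[] */
};

/* First-match semantics, mirroring what x86_match_cpu() provides. */
static const struct cpu_id *match_cpu(const struct cpu_id *tbl, int model)
{
	for (; tbl->model != 0; tbl++)
		if (tbl->model == ANY_MODEL || tbl->model == model)
			return tbl;
	return NULL;
}

int main(void)
{
	/* Mirrors intel_cod_cpu[]: explicit COD rows first, SNC catch-all last. */
	static const struct cpu_id intel_cod_cpu[] = {
		{ 0x3F, 0 },		/* HASWELL_X:   COD */
		{ 0x4F, 0 },		/* BROADWELL_X: COD */
		{ ANY_MODEL, 1 },	/* any other multi-node package: SNC */
		{ 0, 0 }
	};
	int models[] = { 0x3F, 0x55 };	/* e.g. Haswell-X, then Skylake-X */

	for (unsigned int i = 0; i < sizeof(models) / sizeof(models[0]); i++) {
		const struct cpu_id *id = match_cpu(intel_cod_cpu, models[i]);
		int snc = id && id->driver_data;

		printf("model 0x%x -> %s\n", models[i], snc ? "SNC" : "COD");
	}
	return 0;
}
```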
464 | 520 | |
---|
465 | | -/* |
---|
466 | | - * Unlike the other levels, we do not enforce keeping a |
---|
467 | | - * multicore group inside a NUMA node. If this happens, we will |
---|
468 | | - * discard the MC level of the topology later. |
---|
469 | | - */ |
---|
470 | | -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) |
---|
471 | | -{ |
---|
472 | | - if (c->phys_proc_id == o->phys_proc_id) |
---|
473 | | - return true; |
---|
474 | | - return false; |
---|
475 | | -} |
---|
476 | 521 | |
---|
477 | 522 | #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) |
---|
478 | 523 | static inline int x86_sched_itmt_flags(void) |
---|
.. | .. |
---|
536 | 581 | cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu)); |
---|
537 | 582 | cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu)); |
---|
538 | 583 | cpumask_set_cpu(cpu, topology_core_cpumask(cpu)); |
---|
| 584 | + cpumask_set_cpu(cpu, topology_die_cpumask(cpu)); |
---|
539 | 585 | c->booted_cores = 1; |
---|
540 | 586 | return; |
---|
541 | 587 | } |
---|
.. | .. |
---|
543 | 589 | for_each_cpu(i, cpu_sibling_setup_mask) { |
---|
544 | 590 | o = &cpu_data(i); |
---|
545 | 591 | |
---|
| 592 | + if (match_pkg(c, o) && !topology_same_node(c, o)) |
---|
| 593 | + x86_has_numa_in_package = true; |
---|
| 594 | + |
---|
546 | 595 | if ((i == cpu) || (has_smt && match_smt(c, o))) |
---|
547 | 596 | link_mask(topology_sibling_cpumask, cpu, i); |
---|
548 | 597 | |
---|
549 | 598 | if ((i == cpu) || (has_mp && match_llc(c, o))) |
---|
550 | 599 | link_mask(cpu_llc_shared_mask, cpu, i); |
---|
551 | 600 | |
---|
| 601 | + if ((i == cpu) || (has_mp && match_die(c, o))) |
---|
| 602 | + link_mask(topology_die_cpumask, cpu, i); |
---|
552 | 603 | } |
---|
| 604 | + |
---|
| 605 | + threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
---|
| 606 | + if (threads > __max_smt_threads) |
---|
| 607 | + __max_smt_threads = threads; |
---|
553 | 608 | |
---|
554 | 609 | /* |
---|
555 | 610 | * This needs a separate iteration over the cpus because we rely on all |
---|
.. | .. |
---|
558 | 613 | for_each_cpu(i, cpu_sibling_setup_mask) { |
---|
559 | 614 | o = &cpu_data(i); |
---|
560 | 615 | |
---|
561 | | - if ((i == cpu) || (has_mp && match_die(c, o))) { |
---|
| 616 | + if ((i == cpu) || (has_mp && match_pkg(c, o))) { |
---|
562 | 617 | link_mask(topology_core_cpumask, cpu, i); |
---|
563 | 618 | |
---|
564 | 619 | /* |
---|
565 | 620 | * Does this new cpu bringup a new core? |
---|
566 | 621 | */ |
---|
567 | | - if (cpumask_weight( |
---|
568 | | - topology_sibling_cpumask(cpu)) == 1) { |
---|
| 622 | + if (threads == 1) { |
---|
569 | 623 | /* |
---|
570 | 624 | * for each core in package, increment |
---|
571 | 625 | * the booted_cores for this new cpu |
---|
.. | .. |
---|
582 | 636 | } else if (i != cpu && !c->booted_cores) |
---|
583 | 637 | c->booted_cores = cpu_data(i).booted_cores; |
---|
584 | 638 | } |
---|
585 | | - if (match_die(c, o) && !topology_same_node(c, o)) |
---|
586 | | - x86_has_numa_in_package = true; |
---|
587 | 639 | } |
---|
588 | | - |
---|
589 | | - threads = cpumask_weight(topology_sibling_cpumask(cpu)); |
---|
590 | | - if (threads > __max_smt_threads) |
---|
591 | | - __max_smt_threads = threads; |
---|
592 | 640 | } |
---|
593 | 641 | |
---|
594 | 642 | /* maps the cpu to the sched domain representing multi-core */ |
---|
.. | .. |
---|
684 | 732 | |
---|
685 | 733 | /* if modern processor, use no delay */ |
---|
686 | 734 | if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) || |
---|
| 735 | + ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) || |
---|
687 | 736 | ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) { |
---|
688 | 737 | init_udelay = 0; |
---|
689 | 738 | return; |
---|
.. | .. |
---|
848 | 897 | /* reduce the number of lines printed when booting a large cpu count system */ |
---|
849 | 898 | static void announce_cpu(int cpu, int apicid) |
---|
850 | 899 | { |
---|
851 | | - static int current_node = -1; |
---|
| 900 | + static int current_node = NUMA_NO_NODE; |
---|
852 | 901 | int node = early_cpu_to_node(cpu); |
---|
853 | 902 | static int width, node_width; |
---|
854 | 903 | |
---|
.. | .. |
---|
946 | 995 | return boot_error; |
---|
947 | 996 | } |
---|
948 | 997 | |
---|
949 | | -void common_cpu_up(unsigned int cpu, struct task_struct *idle) |
---|
| 998 | +int common_cpu_up(unsigned int cpu, struct task_struct *idle) |
---|
950 | 999 | { |
---|
| 1000 | + int ret; |
---|
| 1001 | + |
---|
951 | 1002 | /* Just in case we booted with a single CPU. */ |
---|
952 | 1003 | alternatives_enable_smp(); |
---|
953 | 1004 | |
---|
954 | 1005 | per_cpu(current_task, cpu) = idle; |
---|
| 1006 | + cpu_init_stack_canary(cpu, idle); |
---|
| 1007 | + |
---|
| 1008 | + /* Initialize the interrupt stack(s) */ |
---|
| 1009 | + ret = irq_init_percpu_irqstack(cpu); |
---|
| 1010 | + if (ret) |
---|
| 1011 | + return ret; |
---|
955 | 1012 | |
---|
956 | 1013 | #ifdef CONFIG_X86_32 |
---|
957 | 1014 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
---|
958 | | - irq_ctx_init(cpu); |
---|
959 | 1015 | per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); |
---|
960 | 1016 | #else |
---|
961 | 1017 | initial_gs = per_cpu_offset(cpu); |
---|
962 | 1018 | #endif |
---|
| 1019 | + return 0; |
---|
963 | 1020 | } |
---|
964 | 1021 | |
---|
965 | 1022 | /* |
---|
.. | .. |
---|
971 | 1028 | static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, |
---|
972 | 1029 | int *cpu0_nmi_registered) |
---|
973 | 1030 | { |
---|
974 | | - volatile u32 *trampoline_status = |
---|
975 | | - (volatile u32 *) __va(real_mode_header->trampoline_status); |
---|
976 | 1031 | /* start_ip had better be page-aligned! */ |
---|
977 | 1032 | unsigned long start_ip = real_mode_header->trampoline_start; |
---|
978 | 1033 | |
---|
.. | .. |
---|
1064 | 1119 | } |
---|
1065 | 1120 | } |
---|
1066 | 1121 | |
---|
1067 | | - /* mark "stuck" area as not stuck */ |
---|
1068 | | - *trampoline_status = 0; |
---|
1069 | | - |
---|
1070 | 1122 | if (x86_platform.legacy.warm_reset) { |
---|
1071 | 1123 | /* |
---|
1072 | 1124 | * Cleanup possible dangling ends... |
---|
.. | .. |
---|
1117 | 1169 | /* the FPU context is blank, nobody can own it */ |
---|
1118 | 1170 | per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; |
---|
1119 | 1171 | |
---|
1120 | | - common_cpu_up(cpu, tidle); |
---|
| 1172 | + err = common_cpu_up(cpu, tidle); |
---|
| 1173 | + if (err) |
---|
| 1174 | + return err; |
---|
1121 | 1175 | |
---|
1122 | 1176 | err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); |
---|
1123 | 1177 | if (err) { |
---|
.. | .. |
---|
1178 | 1232 | physid_set_mask_of_physid(0, &phys_cpu_present_map); |
---|
1179 | 1233 | cpumask_set_cpu(0, topology_sibling_cpumask(0)); |
---|
1180 | 1234 | cpumask_set_cpu(0, topology_core_cpumask(0)); |
---|
| 1235 | + cpumask_set_cpu(0, topology_die_cpumask(0)); |
---|
1181 | 1236 | } |
---|
1182 | 1237 | |
---|
1183 | 1238 | /* |
---|
.. | .. |
---|
1273 | 1328 | for_each_possible_cpu(i) { |
---|
1274 | 1329 | zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); |
---|
1275 | 1330 | zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); |
---|
| 1331 | + zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL); |
---|
1276 | 1332 | zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); |
---|
1277 | 1333 | } |
---|
1278 | 1334 | |
---|
.. | .. |
---|
1286 | 1342 | set_sched_topology(x86_topology); |
---|
1287 | 1343 | |
---|
1288 | 1344 | set_cpu_sibling_map(0); |
---|
1289 | | - |
---|
| 1345 | + init_freq_invariance(false); |
---|
1290 | 1346 | smp_sanity_check(); |
---|
1291 | 1347 | |
---|
1292 | 1348 | switch (apic_intr_mode) { |
---|
.. | .. |
---|
1312 | 1368 | pr_info("CPU0: "); |
---|
1313 | 1369 | print_cpu_info(&cpu_data(0)); |
---|
1314 | 1370 | |
---|
1315 | | - native_pv_lock_init(); |
---|
1316 | | - |
---|
1317 | 1371 | uv_system_init(); |
---|
1318 | 1372 | |
---|
1319 | 1373 | set_mtrr_aps_delayed_init(); |
---|
.. | .. |
---|
1323 | 1377 | speculative_store_bypass_ht_init(); |
---|
1324 | 1378 | } |
---|
1325 | 1379 | |
---|
1326 | | -void arch_enable_nonboot_cpus_begin(void) |
---|
| 1380 | +void arch_thaw_secondary_cpus_begin(void) |
---|
1327 | 1381 | { |
---|
1328 | 1382 | set_mtrr_aps_delayed_init(); |
---|
1329 | 1383 | } |
---|
1330 | 1384 | |
---|
1331 | | -void arch_enable_nonboot_cpus_end(void) |
---|
| 1385 | +void arch_thaw_secondary_cpus_end(void) |
---|
1332 | 1386 | { |
---|
1333 | 1387 | mtrr_aps_init(); |
---|
1334 | 1388 | } |
---|
.. | .. |
---|
1343 | 1397 | /* already set me in cpu_online_mask in boot_cpu_init() */ |
---|
1344 | 1398 | cpumask_set_cpu(me, cpu_callout_mask); |
---|
1345 | 1399 | cpu_set_state_online(me); |
---|
| 1400 | + native_pv_lock_init(); |
---|
1346 | 1401 | } |
---|
1347 | 1402 | |
---|
1348 | 1403 | void __init calculate_max_logical_packages(void) |
---|
.. | .. |
---|
1384 | 1439 | /* |
---|
1385 | 1440 | * cpu_possible_mask should be static, it cannot change as cpu's |
---|
1386 | 1441 | * are onlined, or offlined. The reason is per-cpu data-structures |
---|
1387 | | - * are allocated by some modules at init time, and dont expect to |
---|
| 1442 | + * are allocated by some modules at init time, and don't expect to |
---|
1388 | 1443 | * do this dynamically on cpu arrival/departure. |
---|
1389 | 1444 | * cpu_present_mask on the other hand can change dynamically. |
---|
1390 | 1445 | * In case when cpu_hotplug is not compiled, then we resort to current |
---|
.. | .. |
---|
1493 | 1548 | cpu_data(sibling).booted_cores--; |
---|
1494 | 1549 | } |
---|
1495 | 1550 | |
---|
| 1551 | + for_each_cpu(sibling, topology_die_cpumask(cpu)) |
---|
| 1552 | + cpumask_clear_cpu(cpu, topology_die_cpumask(sibling)); |
---|
1496 | 1553 | for_each_cpu(sibling, topology_sibling_cpumask(cpu)) |
---|
1497 | 1554 | cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling)); |
---|
1498 | 1555 | for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) |
---|
.. | .. |
---|
1500 | 1557 | cpumask_clear(cpu_llc_shared_mask(cpu)); |
---|
1501 | 1558 | cpumask_clear(topology_sibling_cpumask(cpu)); |
---|
1502 | 1559 | cpumask_clear(topology_core_cpumask(cpu)); |
---|
| 1560 | + cpumask_clear(topology_die_cpumask(cpu)); |
---|
1503 | 1561 | c->cpu_core_id = 0; |
---|
1504 | 1562 | c->booted_cores = 0; |
---|
1505 | 1563 | cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); |
---|
.. | .. |
---|
1538 | 1596 | if (ret) |
---|
1539 | 1597 | return ret; |
---|
1540 | 1598 | |
---|
1541 | | - clear_local_APIC(); |
---|
1542 | 1599 | cpu_disable_common(); |
---|
| 1600 | + |
---|
| 1601 | + /* |
---|
| 1602 | + * Disable the local APIC. Otherwise IPI broadcasts will reach |
---|
| 1603 | + * it. It still responds normally to INIT, NMI, SMI, and SIPI |
---|
| 1604 | + * messages. |
---|
| 1605 | + * |
---|
| 1606 | + * Disabling the APIC must happen after cpu_disable_common() |
---|
| 1607 | + * which invokes fixup_irqs(). |
---|
| 1608 | + * |
---|
| 1609 | + * Disabling the APIC preserves already set bits in IRR, but |
---|
| 1610 | + * an interrupt arriving after disabling the local APIC does not |
---|
| 1611 | + * set the corresponding IRR bit. |
---|
| 1612 | + * |
---|
| 1613 | + * fixup_irqs() scans IRR for set bits so it can raise a not |
---|
| 1614 | + * yet handled interrupt on the new destination CPU via an IPI |
---|
| 1615 | + * but obviously it can't do so for IRR bits which are not set. |
---|
| 1616 | + * IOW, interrupts arriving after disabling the local APIC will |
---|
| 1617 | + * be lost. |
---|
| 1618 | + */ |
---|
| 1619 | + apic_soft_disable(); |
---|
1543 | 1620 | |
---|
1544 | 1621 | return 0; |
---|
1545 | 1622 | } |
---|
.. | .. |
---|
1580 | 1657 | local_irq_disable(); |
---|
1581 | 1658 | } |
---|
1582 | 1659 | |
---|
1583 | | -static bool wakeup_cpu0(void) |
---|
| 1660 | +/** |
---|
| 1661 | + * cond_wakeup_cpu0 - Wake up CPU0 if needed. |
---|
| 1662 | + * |
---|
| 1663 | + * If NMI wants to wake up CPU0, start CPU0. |
---|
| 1664 | + */ |
---|
| 1665 | +void cond_wakeup_cpu0(void) |
---|
1584 | 1666 | { |
---|
1585 | 1667 | if (smp_processor_id() == 0 && enable_start_cpu0) |
---|
1586 | | - return true; |
---|
1587 | | - |
---|
1588 | | - return false; |
---|
| 1668 | + start_cpu0(); |
---|
1589 | 1669 | } |
---|
| 1670 | +EXPORT_SYMBOL_GPL(cond_wakeup_cpu0); |
---|
1590 | 1671 | |
---|
1591 | 1672 | /* |
---|
1592 | 1673 | * We need to flush the caches before going to sleep, lest we have |
---|
.. | .. |
---|
1600 | 1681 | void *mwait_ptr; |
---|
1601 | 1682 | int i; |
---|
1602 | 1683 | |
---|
1603 | | - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) |
---|
| 1684 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || |
---|
| 1685 | + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) |
---|
1604 | 1686 | return; |
---|
1605 | 1687 | if (!this_cpu_has(X86_FEATURE_MWAIT)) |
---|
1606 | 1688 | return; |
---|
.. | .. |
---|
1654 | 1736 | __monitor(mwait_ptr, 0, 0); |
---|
1655 | 1737 | mb(); |
---|
1656 | 1738 | __mwait(eax, 0); |
---|
1657 | | - /* |
---|
1658 | | - * If NMI wants to wake up CPU0, start CPU0. |
---|
1659 | | - */ |
---|
1660 | | - if (wakeup_cpu0()) |
---|
1661 | | - start_cpu0(); |
---|
| 1739 | + |
---|
| 1740 | + cond_wakeup_cpu0(); |
---|
1662 | 1741 | } |
---|
1663 | 1742 | } |
---|
1664 | 1743 | |
---|
.. | .. |
---|
1669 | 1748 | |
---|
1670 | 1749 | while (1) { |
---|
1671 | 1750 | native_halt(); |
---|
1672 | | - /* |
---|
1673 | | - * If NMI wants to wake up CPU0, start CPU0. |
---|
1674 | | - */ |
---|
1675 | | - if (wakeup_cpu0()) |
---|
1676 | | - start_cpu0(); |
---|
| 1751 | + |
---|
| 1752 | + cond_wakeup_cpu0(); |
---|
1677 | 1753 | } |
---|
1678 | 1754 | } |
---|
1679 | 1755 | |
---|
.. | .. |
---|
1705 | 1781 | } |
---|
1706 | 1782 | |
---|
1707 | 1783 | #endif |
---|
| 1784 | + |
---|
| 1785 | +#ifdef CONFIG_X86_64 |
---|
| 1786 | +/* |
---|
| 1787 | + * APERF/MPERF frequency ratio computation. |
---|
| 1788 | + * |
---|
| 1789 | + * The scheduler wants to do frequency invariant accounting and needs a <1 |
---|
| 1790 | + * ratio to account for the 'current' frequency, corresponding to |
---|
| 1791 | + * freq_curr / freq_max. |
---|
| 1792 | + * |
---|
| 1793 | + * Since the frequency freq_curr on x86 is controlled by micro-controller and |
---|
| 1794 | + * our P-state setting is little more than a request/hint, we need to observe |
---|
| 1795 | + * the effective frequency 'BusyMHz', i.e. the average frequency over a time |
---|
| 1796 | + * interval after discarding idle time. This is given by: |
---|
| 1797 | + * |
---|
| 1798 | + * BusyMHz = delta_APERF / delta_MPERF * freq_base |
---|
| 1799 | + * |
---|
| 1800 | + * where freq_base is the max non-turbo P-state. |
---|
| 1801 | + * |
---|
| 1802 | + * The freq_max term has to be set to a somewhat arbitrary value, because we |
---|
| 1803 | + * can't know which turbo states will be available at a given point in time: |
---|
| 1804 | + * it all depends on the thermal headroom of the entire package. We set it to |
---|
| 1805 | + * the turbo level with 4 cores active. |
---|
| 1806 | + * |
---|
| 1807 | + * Benchmarks show that's a good compromise between the 1C turbo ratio |
---|
| 1808 | + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, |
---|
| 1809 | + * which would ignore the entire turbo range (a conspicuous part, making |
---|
| 1810 | + * freq_curr/freq_max always maxed out). |
---|
| 1811 | + * |
---|
| 1812 | + * An exception to the heuristic above is the Atom uarch, where we choose the |
---|
| 1813 | + * highest turbo level for freq_max since Atom's are generally oriented towards |
---|
| 1814 | + * power efficiency. |
---|
| 1815 | + * |
---|
| 1816 | + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio |
---|
| 1817 | + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. |
---|
| 1818 | + */ |
---|
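As a standalone illustration of the arithmetic described above (mirroring what arch_scale_freq_tick() further down computes, but with made-up counter deltas and ratios): freq_scale is (delta_APERF / delta_MPERF) divided by (freq_max / freq_base), expressed in SCHED_CAPACITY_SCALE fixed point. The APERF delta is shifted left by 2 * SCHED_CAPACITY_SHIFT because one factor of SCHED_CAPACITY_SCALE converts the result to fixed point and the other cancels the scale factor already baked into arch_max_freq_ratio.

```c
#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1ULL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	/* Assumed platform: base (max non-turbo) ratio 20, 4C turbo ratio 30. */
	uint64_t base_freq = 20, turbo_freq = 30;
	uint64_t max_freq_ratio = turbo_freq * SCHED_CAPACITY_SCALE / base_freq;

	/* Made-up per-tick deltas: the CPU ran at 25/20 of the base clock. */
	uint64_t acnt = 2500000;	/* delta_APERF */
	uint64_t mcnt = 2000000;	/* delta_MPERF */

	/*
	 * freq_scale = (acnt / mcnt) / (turbo / base) in SCHED_CAPACITY_SCALE
	 * units; the double shift compensates for max_freq_ratio already
	 * carrying one SCHED_CAPACITY_SCALE factor.
	 */
	uint64_t freq_scale = (acnt << (2 * SCHED_CAPACITY_SHIFT)) /
			      (mcnt * max_freq_ratio);

	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	/* 25/20 busy clock against a 30/20 ceiling: 25/30 of max, ~853/1024. */
	printf("max_freq_ratio = %llu, freq_scale = %llu\n",
	       (unsigned long long)max_freq_ratio,
	       (unsigned long long)freq_scale);
	return 0;
}
```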
| 1819 | + |
---|
| 1820 | +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); |
---|
| 1821 | + |
---|
| 1822 | +static DEFINE_PER_CPU(u64, arch_prev_aperf); |
---|
| 1823 | +static DEFINE_PER_CPU(u64, arch_prev_mperf); |
---|
| 1824 | +static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE; |
---|
| 1825 | +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; |
---|
| 1826 | + |
---|
| 1827 | +void arch_set_max_freq_ratio(bool turbo_disabled) |
---|
| 1828 | +{ |
---|
| 1829 | + arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE : |
---|
| 1830 | + arch_turbo_freq_ratio; |
---|
| 1831 | +} |
---|
| 1832 | +EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); |
---|
| 1833 | + |
---|
| 1834 | +static bool turbo_disabled(void) |
---|
| 1835 | +{ |
---|
| 1836 | + u64 misc_en; |
---|
| 1837 | + int err; |
---|
| 1838 | + |
---|
| 1839 | + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); |
---|
| 1840 | + if (err) |
---|
| 1841 | + return false; |
---|
| 1842 | + |
---|
| 1843 | + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); |
---|
| 1844 | +} |
---|
| 1845 | + |
---|
| 1846 | +static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) |
---|
| 1847 | +{ |
---|
| 1848 | + int err; |
---|
| 1849 | + |
---|
| 1850 | + err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); |
---|
| 1851 | + if (err) |
---|
| 1852 | + return false; |
---|
| 1853 | + |
---|
| 1854 | + err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); |
---|
| 1855 | + if (err) |
---|
| 1856 | + return false; |
---|
| 1857 | + |
---|
| 1858 | + *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */ |
---|
| 1859 | + *turbo_freq = *turbo_freq & 0x3F; /* 1C turbo */ |
---|
| 1860 | + |
---|
| 1861 | + return true; |
---|
| 1862 | +} |
---|
| 1863 | + |
---|
| 1864 | +#include <asm/cpu_device_id.h> |
---|
| 1865 | +#include <asm/intel-family.h> |
---|
| 1866 | + |
---|
| 1867 | +#define X86_MATCH(model) \ |
---|
| 1868 | + X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ |
---|
| 1869 | + INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) |
---|
| 1870 | + |
---|
| 1871 | +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { |
---|
| 1872 | + X86_MATCH(XEON_PHI_KNL), |
---|
| 1873 | + X86_MATCH(XEON_PHI_KNM), |
---|
| 1874 | + {} |
---|
| 1875 | +}; |
---|
| 1876 | + |
---|
| 1877 | +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { |
---|
| 1878 | + X86_MATCH(SKYLAKE_X), |
---|
| 1879 | + {} |
---|
| 1880 | +}; |
---|
| 1881 | + |
---|
| 1882 | +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { |
---|
| 1883 | + X86_MATCH(ATOM_GOLDMONT), |
---|
| 1884 | + X86_MATCH(ATOM_GOLDMONT_D), |
---|
| 1885 | + X86_MATCH(ATOM_GOLDMONT_PLUS), |
---|
| 1886 | + {} |
---|
| 1887 | +}; |
---|
| 1888 | + |
---|
| 1889 | +static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, |
---|
| 1890 | + int num_delta_fratio) |
---|
| 1891 | +{ |
---|
| 1892 | + int fratio, delta_fratio, found; |
---|
| 1893 | + int err, i; |
---|
| 1894 | + u64 msr; |
---|
| 1895 | + |
---|
| 1896 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1897 | + if (err) |
---|
| 1898 | + return false; |
---|
| 1899 | + |
---|
| 1900 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1901 | + |
---|
| 1902 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); |
---|
| 1903 | + if (err) |
---|
| 1904 | + return false; |
---|
| 1905 | + |
---|
| 1906 | + fratio = (msr >> 8) & 0xFF; |
---|
| 1907 | + i = 16; |
---|
| 1908 | + found = 0; |
---|
| 1909 | + do { |
---|
| 1910 | + if (found >= num_delta_fratio) { |
---|
| 1911 | + *turbo_freq = fratio; |
---|
| 1912 | + return true; |
---|
| 1913 | + } |
---|
| 1914 | + |
---|
| 1915 | + delta_fratio = (msr >> (i + 5)) & 0x7; |
---|
| 1916 | + |
---|
| 1917 | + if (delta_fratio) { |
---|
| 1918 | + found += 1; |
---|
| 1919 | + fratio -= delta_fratio; |
---|
| 1920 | + } |
---|
| 1921 | + |
---|
| 1922 | + i += 8; |
---|
| 1923 | + } while (i < 64); |
---|
| 1924 | + |
---|
| 1925 | + return true; |
---|
| 1926 | +} |
---|
| 1927 | + |
---|
| 1928 | +static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) |
---|
| 1929 | +{ |
---|
| 1930 | + u64 ratios, counts; |
---|
| 1931 | + u32 group_size; |
---|
| 1932 | + int err, i; |
---|
| 1933 | + |
---|
| 1934 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1935 | + if (err) |
---|
| 1936 | + return false; |
---|
| 1937 | + |
---|
| 1938 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1939 | + |
---|
| 1940 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); |
---|
| 1941 | + if (err) |
---|
| 1942 | + return false; |
---|
| 1943 | + |
---|
| 1944 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); |
---|
| 1945 | + if (err) |
---|
| 1946 | + return false; |
---|
| 1947 | + |
---|
| 1948 | + for (i = 0; i < 64; i += 8) { |
---|
| 1949 | + group_size = (counts >> i) & 0xFF; |
---|
| 1950 | + if (group_size >= size) { |
---|
| 1951 | + *turbo_freq = (ratios >> i) & 0xFF; |
---|
| 1952 | + return true; |
---|
| 1953 | + } |
---|
| 1954 | + } |
---|
| 1955 | + |
---|
| 1956 | + return false; |
---|
| 1957 | +} |
---|
| 1958 | + |
---|
| 1959 | +static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) |
---|
| 1960 | +{ |
---|
| 1961 | + u64 msr; |
---|
| 1962 | + int err; |
---|
| 1963 | + |
---|
| 1964 | + err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); |
---|
| 1965 | + if (err) |
---|
| 1966 | + return false; |
---|
| 1967 | + |
---|
| 1968 | + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); |
---|
| 1969 | + if (err) |
---|
| 1970 | + return false; |
---|
| 1971 | + |
---|
| 1972 | + *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ |
---|
| 1973 | + *turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */ |
---|
| 1974 | + |
---|
| 1975 | + /* The CPU may have less than 4 cores */ |
---|
| 1976 | + if (!*turbo_freq) |
---|
| 1977 | + *turbo_freq = msr & 0xFF; /* 1C turbo */ |
---|
| 1978 | + |
---|
| 1979 | + return true; |
---|
| 1980 | +} |
---|
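A small sketch, using made-up register values, of the field extraction core_set_max_freq_ratio() performs above: bits 15:8 of MSR_PLATFORM_INFO hold the maximum non-turbo ratio, and the low four bytes of MSR_TURBO_RATIO_LIMIT hold the 1C/2C/3C/4C turbo ratios, so byte 3 is the 4-core limit used as freq_max (falling back to the 1C value when it reads as zero). The constants are illustrative only; the kernel reads the real MSRs with rdmsrl_safe().

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/*
	 * Made-up register contents for illustration:
	 *   MSR_PLATFORM_INFO:     base ratio 24 in bits 15:8
	 *   MSR_TURBO_RATIO_LIMIT: 1C=37 2C=36 3C=35 4C=34 in bytes 0..3
	 */
	uint64_t platform_info = 24ULL << 8;
	uint64_t turbo_ratio_limit = 37ULL | 36ULL << 8 | 35ULL << 16 | 34ULL << 24;

	uint64_t base_freq = (platform_info >> 8) & 0xFF;	/* max P-state */
	uint64_t turbo_freq = (turbo_ratio_limit >> 24) & 0xFF;	/* 4C turbo    */

	if (!turbo_freq)					/* <4 core CPU */
		turbo_freq = turbo_ratio_limit & 0xFF;		/* 1C turbo    */

	/* With a 100 MHz bus clock this means a 2.4 GHz base and 3.4 GHz cap. */
	printf("base ratio %llu, turbo ratio %llu\n",
	       (unsigned long long)base_freq, (unsigned long long)turbo_freq);
	return 0;
}
```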
| 1981 | + |
---|
| 1982 | +static bool intel_set_max_freq_ratio(void) |
---|
| 1983 | +{ |
---|
| 1984 | + u64 base_freq, turbo_freq; |
---|
| 1985 | + u64 turbo_ratio; |
---|
| 1986 | + |
---|
| 1987 | + if (slv_set_max_freq_ratio(&base_freq, &turbo_freq)) |
---|
| 1988 | + goto out; |
---|
| 1989 | + |
---|
| 1990 | + if (x86_match_cpu(has_glm_turbo_ratio_limits) && |
---|
| 1991 | + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) |
---|
| 1992 | + goto out; |
---|
| 1993 | + |
---|
| 1994 | + if (x86_match_cpu(has_knl_turbo_ratio_limits) && |
---|
| 1995 | + knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1)) |
---|
| 1996 | + goto out; |
---|
| 1997 | + |
---|
| 1998 | + if (x86_match_cpu(has_skx_turbo_ratio_limits) && |
---|
| 1999 | + skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4)) |
---|
| 2000 | + goto out; |
---|
| 2001 | + |
---|
| 2002 | + if (core_set_max_freq_ratio(&base_freq, &turbo_freq)) |
---|
| 2003 | + goto out; |
---|
| 2004 | + |
---|
| 2005 | + return false; |
---|
| 2006 | + |
---|
| 2007 | +out: |
---|
| 2008 | + /* |
---|
| 2009 | + * Some hypervisors advertise X86_FEATURE_APERFMPERF |
---|
| 2010 | + * but then fill all MSR's with zeroes. |
---|
| 2011 | + * Some CPUs have turbo boost but don't declare any turbo ratio |
---|
| 2012 | + * in MSR_TURBO_RATIO_LIMIT. |
---|
| 2013 | + */ |
---|
| 2014 | + if (!base_freq || !turbo_freq) { |
---|
| 2015 | + pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n"); |
---|
| 2016 | + return false; |
---|
| 2017 | + } |
---|
| 2018 | + |
---|
| 2019 | + turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq); |
---|
| 2020 | + if (!turbo_ratio) { |
---|
| 2021 | + pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n"); |
---|
| 2022 | + return false; |
---|
| 2023 | + } |
---|
| 2024 | + |
---|
| 2025 | + arch_turbo_freq_ratio = turbo_ratio; |
---|
| 2026 | + arch_set_max_freq_ratio(turbo_disabled()); |
---|
| 2027 | + |
---|
| 2028 | + return true; |
---|
| 2029 | +} |
---|
| 2030 | + |
---|
| 2031 | +static void init_counter_refs(void) |
---|
| 2032 | +{ |
---|
| 2033 | + u64 aperf, mperf; |
---|
| 2034 | + |
---|
| 2035 | + rdmsrl(MSR_IA32_APERF, aperf); |
---|
| 2036 | + rdmsrl(MSR_IA32_MPERF, mperf); |
---|
| 2037 | + |
---|
| 2038 | + this_cpu_write(arch_prev_aperf, aperf); |
---|
| 2039 | + this_cpu_write(arch_prev_mperf, mperf); |
---|
| 2040 | +} |
---|
| 2041 | + |
---|
| 2042 | +static void init_freq_invariance(bool secondary) |
---|
| 2043 | +{ |
---|
| 2044 | + bool ret = false; |
---|
| 2045 | + |
---|
| 2046 | + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) |
---|
| 2047 | + return; |
---|
| 2048 | + |
---|
| 2049 | + if (secondary) { |
---|
| 2050 | + if (static_branch_likely(&arch_scale_freq_key)) { |
---|
| 2051 | + init_counter_refs(); |
---|
| 2052 | + } |
---|
| 2053 | + return; |
---|
| 2054 | + } |
---|
| 2055 | + |
---|
| 2056 | + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) |
---|
| 2057 | + ret = intel_set_max_freq_ratio(); |
---|
| 2058 | + |
---|
| 2059 | + if (ret) { |
---|
| 2060 | + init_counter_refs(); |
---|
| 2061 | + static_branch_enable(&arch_scale_freq_key); |
---|
| 2062 | + } else { |
---|
| 2063 | + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); |
---|
| 2064 | + } |
---|
| 2065 | +} |
---|
| 2066 | + |
---|
| 2067 | +static void disable_freq_invariance_workfn(struct work_struct *work) |
---|
| 2068 | +{ |
---|
| 2069 | + static_branch_disable(&arch_scale_freq_key); |
---|
| 2070 | +} |
---|
| 2071 | + |
---|
| 2072 | +static DECLARE_WORK(disable_freq_invariance_work, |
---|
| 2073 | + disable_freq_invariance_workfn); |
---|
| 2074 | + |
---|
| 2075 | +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; |
---|
| 2076 | + |
---|
| 2077 | +void arch_scale_freq_tick(void) |
---|
| 2078 | +{ |
---|
| 2079 | + u64 freq_scale = SCHED_CAPACITY_SCALE; |
---|
| 2080 | + u64 aperf, mperf; |
---|
| 2081 | + u64 acnt, mcnt; |
---|
| 2082 | + |
---|
| 2083 | + if (!arch_scale_freq_invariant()) |
---|
| 2084 | + return; |
---|
| 2085 | + |
---|
| 2086 | + rdmsrl(MSR_IA32_APERF, aperf); |
---|
| 2087 | + rdmsrl(MSR_IA32_MPERF, mperf); |
---|
| 2088 | + |
---|
| 2089 | + acnt = aperf - this_cpu_read(arch_prev_aperf); |
---|
| 2090 | + mcnt = mperf - this_cpu_read(arch_prev_mperf); |
---|
| 2091 | + |
---|
| 2092 | + this_cpu_write(arch_prev_aperf, aperf); |
---|
| 2093 | + this_cpu_write(arch_prev_mperf, mperf); |
---|
| 2094 | + |
---|
| 2095 | + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) |
---|
| 2096 | + goto error; |
---|
| 2097 | + |
---|
| 2098 | + if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt) |
---|
| 2099 | + goto error; |
---|
| 2100 | + |
---|
| 2101 | + freq_scale = div64_u64(acnt, mcnt); |
---|
| 2102 | + if (!freq_scale) |
---|
| 2103 | + goto error; |
---|
| 2104 | + |
---|
| 2105 | + if (freq_scale > SCHED_CAPACITY_SCALE) |
---|
| 2106 | + freq_scale = SCHED_CAPACITY_SCALE; |
---|
| 2107 | + |
---|
| 2108 | + this_cpu_write(arch_freq_scale, freq_scale); |
---|
| 2109 | + return; |
---|
| 2110 | + |
---|
| 2111 | +error: |
---|
| 2112 | + pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); |
---|
| 2113 | + schedule_work(&disable_freq_invariance_work); |
---|
| 2114 | +} |
---|
| 2115 | +#else |
---|
| 2116 | +static inline void init_freq_invariance(bool secondary) |
---|
| 2117 | +{ |
---|
| 2118 | +} |
---|
| 2119 | +#endif /* CONFIG_X86_64 */ |
---|