2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * x86 SMP booting functions
  *
@@ -11,9 +12,6 @@
  * Thanks to Intel for making available several different Pentium,
  * Pentium Pro and Pentium-II/Xeon MP machines.
  * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
  *
  * Fixes
  *	Felix Koop	:	NR_CPUS used properly
@@ -49,13 +47,15 @@
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/nmi.h>
 #include <linux/tboot.h>
-#include <linux/stackprotector.h>
 #include <linux/gfp.h>
 #include <linux/cpuidle.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -64,7 +64,6 @@
 #include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
 #include <asm/mwait.h>
@@ -81,6 +80,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/spec-ctrl.h>
 #include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -89,6 +89,10 @@
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
@@ -100,6 +104,7 @@
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
+static unsigned int logical_die __read_mostly;
 
 /* Maximum number of SMT threads on any online core */
 int __read_mostly __max_smt_threads = 1;
@@ -143,13 +148,15 @@
 	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
+static void init_freq_invariance(bool secondary);
+
 /*
  * Report back to the Boot Processor during boot time or to the caller processor
  * during CPU online.
  */
 static void smp_callin(void)
 {
-	int cpuid, phys_id;
+	int cpuid;
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
@@ -158,11 +165,6 @@
 	 * now safe to touch our local APIC.
 	 */
 	cpuid = smp_processor_id();
-
-	/*
-	 * (This works even if the APIC is not enabled.)
-	 */
-	phys_id = read_apic_id();
 
 	/*
 	 * the boot CPU has finished the init stage and is spinning
@@ -183,6 +185,8 @@
 	 * calibrate_delay() and notify_cpu_starting().
 	 */
 	set_cpu_sibling_map(raw_smp_processor_id());
+
+	init_freq_invariance(true);
 
 	/*
 	 * Get our bogomips.
@@ -216,23 +220,17 @@
 	 * before cpu_init(), SMP booting is too fragile that we want to
 	 * limit the things done here to the most necessary things.
 	 */
-	if (boot_cpu_has(X86_FEATURE_PCID))
-		__write_cr4(__read_cr4() | X86_CR4_PCIDE);
+	cr4_init();
 
 #ifdef CONFIG_X86_32
 	/* switch away from the initial page table */
 	load_cr3(swapper_pg_dir);
-	/*
-	 * Initialize the CR4 shadow before doing anything that could
-	 * try to read it.
-	 */
-	cr4_init_shadow();
 	__flush_tlb_all();
 #endif
-	load_current_idt();
+	cpu_init_exception_handling();
 	cpu_init();
+	rcu_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
-	preempt_disable();
 	smp_callin();
 
 	enable_start_cpu0 = 0;
@@ -262,21 +260,10 @@
 	/* enable local interrupts */
 	local_irq_enable();
 
-	/* to prevent fake stack check failure in clock setup */
-	boot_init_stack_canary();
-
 	x86_cpuinit.setup_percpu_clockev();
 
 	wmb();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
-	/*
-	 * Prevent tail call to cpu_startup_entry() because the stack protector
-	 * guard has been changed a couple of function calls up, in
-	 * boot_init_stack_canary() and must not be checked before tail calling
-	 * another function.
-	 */
-	prevent_tail_call_optimization();
 }
 
 /**
@@ -314,6 +301,26 @@
 	return -1;
 }
 EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+/**
+ * topology_phys_to_logical_die - Map a physical die id to logical
+ *
+ * Returns logical die id or -1 if not found
+ */
+int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
+{
+	int cpu;
+	int proc_id = cpu_data(cur_cpu).phys_proc_id;
+
+	for_each_possible_cpu(cpu) {
+		struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+		if (c->initialized && c->cpu_die_id == die_id &&
+		    c->phys_proc_id == proc_id)
+			return c->logical_die_id;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_die);
 
 /**
  * topology_update_package_map - Update the physical to logical package map
@@ -338,6 +345,29 @@
 	cpu_data(cpu).logical_proc_id = new;
 	return 0;
 }
+/**
+ * topology_update_die_map - Update the physical to logical die map
+ * @die:	The die id as retrieved via CPUID
+ * @cpu:	The cpu for which this is updated
+ */
+int topology_update_die_map(unsigned int die, unsigned int cpu)
+{
+	int new;
+
+	/* Already available somewhere? */
+	new = topology_phys_to_logical_die(die, cpu);
+	if (new >= 0)
+		goto found;
+
+	new = logical_die++;
+	if (new != die) {
+		pr_info("CPU %u Converting physical %u to logical die %u\n",
+			cpu, die, new);
+	}
+found:
+	cpu_data(cpu).logical_die_id = new;
+	return 0;
+}
 
 void __init smp_store_boot_cpu_info(void)
 {
@@ -347,6 +377,7 @@
 	*c = boot_cpu_data;
 	c->cpu_index = id;
 	topology_update_package_map(c->phys_proc_id, id);
+	topology_update_die_map(c->cpu_die_id, id);
 	c->initialized = true;
 }
 
@@ -401,6 +432,7 @@
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
 	if (c->phys_proc_id == o->phys_proc_id &&
+	    c->cpu_die_id == o->cpu_die_id &&
 	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
 		if (c->cpu_core_id == o->cpu_core_id)
 			return topology_sane(c, o, "smt");
@@ -412,6 +444,7 @@
 		}
 
 	} else if (c->phys_proc_id == o->phys_proc_id &&
+		   c->cpu_die_id == o->cpu_die_id &&
 		   c->cpu_core_id == o->cpu_core_id) {
 		return topology_sane(c, o, "smt");
 	}
@@ -419,29 +452,52 @@
 	return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id &&
+	    c->cpu_die_id == o->cpu_die_id)
+		return true;
+	return false;
+}
+
 /*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id)
+		return true;
+	return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
  *
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
  *
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
  */
 
-static const struct x86_cpu_id snc_cpu[] = {
-	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+static const struct x86_cpu_id intel_cod_cpu[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
+	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
+	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
 	{}
 };
 
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
+	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+	bool intel_snc = id && id->driver_data;
 
 	/* Do not match if we do not have a valid APICID for cpu: */
 	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -456,23 +512,12 @@
 	 * means 'c' does not share the LLC of 'o'. This will be
 	 * reflected to userspace.
 	 */
-	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
 		return false;
 
 	return topology_sane(c, o, "llc");
 }
 
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
-	if (c->phys_proc_id == o->phys_proc_id)
-		return true;
-	return false;
-}
 
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
@@ -536,6 +581,7 @@
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
@@ -543,13 +589,22 @@
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
+		if (match_pkg(c, o) && !topology_same_node(c, o))
+			x86_has_numa_in_package = true;
+
 		if ((i == cpu) || (has_smt && match_smt(c, o)))
 			link_mask(topology_sibling_cpumask, cpu, i);
 
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
 			link_mask(cpu_llc_shared_mask, cpu, i);
 
+		if ((i == cpu) || (has_mp && match_die(c, o)))
+			link_mask(topology_die_cpumask, cpu, i);
 	}
+
+	threads = cpumask_weight(topology_sibling_cpumask(cpu));
+	if (threads > __max_smt_threads)
+		__max_smt_threads = threads;
 
 	/*
 	 * This needs a separate iteration over the cpus because we rely on all
@@ -558,14 +613,13 @@
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
-		if ((i == cpu) || (has_mp && match_die(c, o))) {
+		if ((i == cpu) || (has_mp && match_pkg(c, o))) {
 			link_mask(topology_core_cpumask, cpu, i);
 
 			/*
 			 * Does this new cpu bringup a new core?
 			 */
-			if (cpumask_weight(
-			    topology_sibling_cpumask(cpu)) == 1) {
+			if (threads == 1) {
 				/*
 				 * for each core in package, increment
 				 * the booted_cores for this new cpu
@@ -582,13 +636,7 @@
 		} else if (i != cpu && !c->booted_cores)
 			c->booted_cores = cpu_data(i).booted_cores;
 		}
-		if (match_die(c, o) && !topology_same_node(c, o))
-			x86_has_numa_in_package = true;
 	}
-
-	threads = cpumask_weight(topology_sibling_cpumask(cpu));
-	if (threads > __max_smt_threads)
-		__max_smt_threads = threads;
 }
 
 /* maps the cpu to the sched domain representing multi-core */
@@ -684,6 +732,7 @@
 
 	/* if modern processor, use no delay */
 	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+	    ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
 	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
 		init_udelay = 0;
 		return;
@@ -848,7 +897,7 @@
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
-	static int current_node = -1;
+	static int current_node = NUMA_NO_NODE;
 	int node = early_cpu_to_node(cpu);
 	static int width, node_width;
 
@@ -946,20 +995,28 @@
 	return boot_error;
 }
 
-void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
+	int ret;
+
 	/* Just in case we booted with a single CPU. */
 	alternatives_enable_smp();
 
 	per_cpu(current_task, cpu) = idle;
+	cpu_init_stack_canary(cpu, idle);
+
+	/* Initialize the interrupt stack(s) */
+	ret = irq_init_percpu_irqstack(cpu);
+	if (ret)
+		return ret;
 
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
-	irq_ctx_init(cpu);
 	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
 	initial_gs = per_cpu_offset(cpu);
 #endif
+	return 0;
 }
 
 /*
@@ -971,8 +1028,6 @@
 static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		       int *cpu0_nmi_registered)
 {
-	volatile u32 *trampoline_status =
-		(volatile u32 *) __va(real_mode_header->trampoline_status);
 	/* start_ip had better be page-aligned! */
 	unsigned long start_ip = real_mode_header->trampoline_start;
 
@@ -1064,9 +1119,6 @@
 		}
 	}
 
-	/* mark "stuck" area as not stuck */
-	*trampoline_status = 0;
-
 	if (x86_platform.legacy.warm_reset) {
 		/*
 		 * Cleanup possible dangling ends...
@@ -1117,7 +1169,9 @@
 	/* the FPU context is blank, nobody can own it */
 	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
-	common_cpu_up(cpu, tidle);
+	err = common_cpu_up(cpu, tidle);
+	if (err)
+		return err;
 
 	err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
 	if (err) {
@@ -1178,6 +1232,7 @@
 	physid_set_mask_of_physid(0, &phys_cpu_present_map);
 	cpumask_set_cpu(0, topology_sibling_cpumask(0));
 	cpumask_set_cpu(0, topology_core_cpumask(0));
+	cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
 /*
@@ -1273,6 +1328,7 @@
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 
@@ -1286,7 +1342,7 @@
 	set_sched_topology(x86_topology);
 
 	set_cpu_sibling_map(0);
-
+	init_freq_invariance(false);
 	smp_sanity_check();
 
 	switch (apic_intr_mode) {
@@ -1312,8 +1368,6 @@
 	pr_info("CPU0: ");
 	print_cpu_info(&cpu_data(0));
 
-	native_pv_lock_init();
-
 	uv_system_init();
 
 	set_mtrr_aps_delayed_init();
@@ -1323,12 +1377,12 @@
 	speculative_store_bypass_ht_init();
 }
 
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
 {
 	set_mtrr_aps_delayed_init();
 }
 
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
 {
 	mtrr_aps_init();
 }
@@ -1343,6 +1397,7 @@
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
 	cpu_set_state_online(me);
+	native_pv_lock_init();
 }
 
 void __init calculate_max_logical_packages(void)
@@ -1384,7 +1439,7 @@
 	/*
 	 * cpu_possible_mask should be static, it cannot change as cpu's
 	 * are onlined, or offlined. The reason is per-cpu data-structures
-	 * are allocated by some modules at init time, and dont expect to
+	 * are allocated by some modules at init time, and don't expect to
 	 * do this dynamically on cpu arrival/departure.
 	 * cpu_present_mask on the other hand can change dynamically.
 	 * In case when cpu_hotplug is not compiled, then we resort to current
@@ -1493,6 +1548,8 @@
 			cpu_data(sibling).booted_cores--;
 	}
 
+	for_each_cpu(sibling, topology_die_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1500,6 +1557,7 @@
 	cpumask_clear(cpu_llc_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
+	cpumask_clear(topology_die_cpumask(cpu));
 	c->cpu_core_id = 0;
 	c->booted_cores = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
@@ -1538,8 +1596,27 @@
 	if (ret)
 		return ret;
 
-	clear_local_APIC();
 	cpu_disable_common();
+
+	/*
+	 * Disable the local APIC. Otherwise IPI broadcasts will reach
+	 * it. It still responds normally to INIT, NMI, SMI, and SIPI
+	 * messages.
+	 *
+	 * Disabling the APIC must happen after cpu_disable_common()
+	 * which invokes fixup_irqs().
+	 *
+	 * Disabling the APIC preserves already set bits in IRR, but
+	 * an interrupt arriving after disabling the local APIC does not
+	 * set the corresponding IRR bit.
+	 *
+	 * fixup_irqs() scans IRR for set bits so it can raise a not
+	 * yet handled interrupt on the new destination CPU via an IPI
+	 * but obviously it can't do so for IRR bits which are not set.
+	 * IOW, interrupts arriving after disabling the local APIC will
+	 * be lost.
+	 */
+	apic_soft_disable();
 
 	return 0;
 }
@@ -1580,13 +1657,17 @@
 	local_irq_disable();
 }
 
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
 {
 	if (smp_processor_id() == 0 && enable_start_cpu0)
-		return true;
-
-	return false;
+		start_cpu0();
 }
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
 
 /*
  * We need to flush the caches before going to sleep, lest we have
@@ -1600,7 +1681,8 @@
 	void *mwait_ptr;
 	int i;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
 		return;
 	if (!this_cpu_has(X86_FEATURE_MWAIT))
 		return;
@@ -1654,11 +1736,8 @@
 		__monitor(mwait_ptr, 0, 0);
 		mb();
 		__mwait(eax, 0);
-		/*
-		 * If NMI wants to wake up CPU0, start CPU0.
-		 */
-		if (wakeup_cpu0())
-			start_cpu0();
+
+		cond_wakeup_cpu0();
 	}
 }
 
@@ -1669,11 +1748,8 @@
 
 	while (1) {
 		native_halt();
-		/*
-		 * If NMI wants to wake up CPU0, start CPU0.
-		 */
-		if (wakeup_cpu0())
-			start_cpu0();
+
+		cond_wakeup_cpu0();
 	}
 }
 
@@ -1705,3 +1781,339 @@
 }
 
 #endif
+
+#ifdef CONFIG_X86_64
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ *            BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+					arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool turbo_disabled(void)
+{
+	u64 misc_en;
+	int err;
+
+	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+	if (err)
+		return false;
+
+	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+	int err;
+
+	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
+	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */
+
+	return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define X86_MATCH(model)					\
+	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
+		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+	X86_MATCH(XEON_PHI_KNL),
+	X86_MATCH(XEON_PHI_KNM),
+	{}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+	X86_MATCH(SKYLAKE_X),
+	{}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+	X86_MATCH(ATOM_GOLDMONT),
+	X86_MATCH(ATOM_GOLDMONT_D),
+	X86_MATCH(ATOM_GOLDMONT_PLUS),
+	{}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+				int num_delta_fratio)
+{
+	int fratio, delta_fratio, found;
+	int err, i;
+	u64 msr;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+	if (err)
+		return false;
+
+	fratio = (msr >> 8) & 0xFF;
+	i = 16;
+	found = 0;
+	do {
+		if (found >= num_delta_fratio) {
+			*turbo_freq = fratio;
+			return true;
+		}
+
+		delta_fratio = (msr >> (i + 5)) & 0x7;
+
+		if (delta_fratio) {
+			found += 1;
+			fratio -= delta_fratio;
+		}
+
+		i += 8;
+	} while (i < 64);
+
+	return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+	u64 ratios, counts;
+	u32 group_size;
+	int err, i;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+	if (err)
+		return false;
+
+	for (i = 0; i < 64; i += 8) {
+		group_size = (counts >> i) & 0xFF;
+		if (group_size >= size) {
+			*turbo_freq = (ratios >> i) & 0xFF;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+	u64 msr;
+	int err;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */
+
+	/* The CPU may have less than 4 cores */
+	if (!*turbo_freq)
+		*turbo_freq = msr & 0xFF;	/* 1C turbo    */
+
+	return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+	u64 base_freq, turbo_freq;
+	u64 turbo_ratio;
+
+	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+		goto out;
+
+	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+		goto out;
+
+	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+		goto out;
+
+	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+		goto out;
+
+	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+		goto out;
+
+	return false;
+
+out:
+	/*
+	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
+	 * but then fill all MSR's with zeroes.
+	 * Some CPUs have turbo boost but don't declare any turbo ratio
+	 * in MSR_TURBO_RATIO_LIMIT.
+	 */
+	if (!base_freq || !turbo_freq) {
+		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+		return false;
+	}
+
+	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+	if (!turbo_ratio) {
+		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+		return false;
+	}
+
+	arch_turbo_freq_ratio = turbo_ratio;
+	arch_set_max_freq_ratio(turbo_disabled());
+
+	return true;
+}
+
+static void init_counter_refs(void)
+{
+	u64 aperf, mperf;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	this_cpu_write(arch_prev_aperf, aperf);
+	this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+	bool ret = false;
+
+	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+		return;
+
+	if (secondary) {
+		if (static_branch_likely(&arch_scale_freq_key)) {
+			init_counter_refs();
+		}
+		return;
+	}
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		ret = intel_set_max_freq_ratio();
+
+	if (ret) {
+		init_counter_refs();
+		static_branch_enable(&arch_scale_freq_key);
+	} else {
+		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+	}
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+	static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+		    disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+	u64 freq_scale = SCHED_CAPACITY_SCALE;
+	u64 aperf, mperf;
+	u64 acnt, mcnt;
+
+	if (!arch_scale_freq_invariant())
+		return;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	acnt = aperf - this_cpu_read(arch_prev_aperf);
+	mcnt = mperf - this_cpu_read(arch_prev_mperf);
+
+	this_cpu_write(arch_prev_aperf, aperf);
+	this_cpu_write(arch_prev_mperf, mperf);
+
+	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+		goto error;
+
+	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+		goto error;
+
+	freq_scale = div64_u64(acnt, mcnt);
+	if (!freq_scale)
+		goto error;
+
+	if (freq_scale > SCHED_CAPACITY_SCALE)
+		freq_scale = SCHED_CAPACITY_SCALE;
+
+	this_cpu_write(arch_freq_scale, freq_scale);
+	return;
+
+error:
+	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+	schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
+}
+#endif /* CONFIG_X86_64 */
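
For reference, the per-tick scale factor computed in arch_scale_freq_tick() above reduces to

	freq_scale = (delta_APERF << 2*SCHED_CAPACITY_SHIFT) / (delta_MPERF * arch_max_freq_ratio)

clipped to SCHED_CAPACITY_SCALE. The stand-alone C sketch below is illustrative only: it is not part of the patch, and the APERF/MPERF deltas and turbo/base ratio are made-up sample values rather than real MSR reads. It just reproduces the same arithmetic in user space.

/*
 * Minimal sketch of the arch_scale_freq_tick() ratio math, using
 * invented sample deltas instead of reading the MSRs.
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t acnt = 180000;		/* hypothetical delta_APERF over one tick */
	uint64_t mcnt = 200000;		/* hypothetical delta_MPERF over one tick */
	uint64_t max_freq_ratio = 1434;	/* ~3.5 GHz 4C turbo / 2.5 GHz base, scaled by 1024 */
	uint64_t freq_scale;

	/* acnt carries SCALE^2 so the quotient keeps one factor of SCALE */
	acnt <<= 2 * SCHED_CAPACITY_SHIFT;
	mcnt *= max_freq_ratio;

	freq_scale = acnt / mcnt;
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	printf("freq_scale = %llu of %lu\n",
	       (unsigned long long)freq_scale,
	       (unsigned long)SCHED_CAPACITY_SCALE);
	return 0;
}

With these inputs the program prints "freq_scale = 658 of 1024", i.e. the CPU spent the interval at roughly 64% of the assumed 4-core-turbo maximum, which is the fraction the scheduler then uses for frequency-invariant load accounting.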