forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
kernel/arch/x86/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * x86 SMP booting functions
  *
@@ -11,9 +12,6 @@
  * Thanks to Intel for making available several different Pentium,
  * Pentium Pro and Pentium-II/Xeon MP machines.
  * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
  *
  * Fixes
  *	Felix Koop : NR_CPUS used properly
@@ -49,13 +47,15 @@
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/nmi.h>
 #include <linux/tboot.h>
-#include <linux/stackprotector.h>
 #include <linux/gfp.h>
 #include <linux/cpuidle.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -64,7 +64,6 @@
 #include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
 #include <asm/mwait.h>
@@ -81,6 +80,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/spec-ctrl.h>
 #include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -90,16 +90,32 @@
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
+
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+struct mwait_cpu_dead {
+        unsigned int control;
+        unsigned int status;
+};
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
 /* Logical package management. We might want to allocate that dynamically */
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
+static unsigned int logical_die __read_mostly;
 
 /* Maximum number of SMT threads on any online core */
 int __read_mostly __max_smt_threads = 1;
@@ -143,13 +159,15 @@
         *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
+static void init_freq_invariance(bool secondary);
+
 /*
  * Report back to the Boot Processor during boot time or to the caller processor
  * during CPU online.
  */
 static void smp_callin(void)
 {
-        int cpuid, phys_id;
+        int cpuid;
 
         /*
          * If waken up by an INIT in an 82489DX configuration
@@ -158,11 +176,6 @@
          * now safe to touch our local APIC.
          */
         cpuid = smp_processor_id();
-
-        /*
-         * (This works even if the APIC is not enabled.)
-         */
-        phys_id = read_apic_id();
 
         /*
          * the boot CPU has finished the init stage and is spinning
@@ -183,6 +196,8 @@
          * calibrate_delay() and notify_cpu_starting().
          */
         set_cpu_sibling_map(raw_smp_processor_id());
+
+        init_freq_invariance(true);
 
         /*
          * Get our bogomips.
@@ -216,23 +231,16 @@
          * before cpu_init(), SMP booting is too fragile that we want to
          * limit the things done here to the most necessary things.
          */
-        if (boot_cpu_has(X86_FEATURE_PCID))
-                __write_cr4(__read_cr4() | X86_CR4_PCIDE);
+        cr4_init();
 
 #ifdef CONFIG_X86_32
         /* switch away from the initial page table */
         load_cr3(swapper_pg_dir);
-        /*
-         * Initialize the CR4 shadow before doing anything that could
-         * try to read it.
-         */
-        cr4_init_shadow();
         __flush_tlb_all();
 #endif
-        load_current_idt();
-        cpu_init();
+        cpu_init_secondary();
+        rcu_cpu_starting(raw_smp_processor_id());
         x86_cpuinit.early_percpu_clock_init();
-        preempt_disable();
         smp_callin();
 
         enable_start_cpu0 = 0;
@@ -262,21 +270,10 @@
         /* enable local interrupts */
         local_irq_enable();
 
-        /* to prevent fake stack check failure in clock setup */
-        boot_init_stack_canary();
-
         x86_cpuinit.setup_percpu_clockev();
 
         wmb();
         cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
-        /*
-         * Prevent tail call to cpu_startup_entry() because the stack protector
-         * guard has been changed a couple of function calls up, in
-         * boot_init_stack_canary() and must not be checked before tail calling
-         * another function.
-         */
-        prevent_tail_call_optimization();
 }
 
 /**
@@ -314,6 +311,26 @@
         return -1;
 }
 EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+/**
+ * topology_phys_to_logical_die - Map a physical die id to logical
+ *
+ * Returns logical die id or -1 if not found
+ */
+int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
+{
+        int cpu;
+        int proc_id = cpu_data(cur_cpu).phys_proc_id;
+
+        for_each_possible_cpu(cpu) {
+                struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+                if (c->initialized && c->cpu_die_id == die_id &&
+                    c->phys_proc_id == proc_id)
+                        return c->logical_die_id;
+        }
+        return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_die);
 
 /**
  * topology_update_package_map - Update the physical to logical package map
@@ -338,6 +355,29 @@
         cpu_data(cpu).logical_proc_id = new;
         return 0;
 }
+/**
+ * topology_update_die_map - Update the physical to logical die map
+ * @die: The die id as retrieved via CPUID
+ * @cpu: The cpu for which this is updated
+ */
+int topology_update_die_map(unsigned int die, unsigned int cpu)
+{
+        int new;
+
+        /* Already available somewhere? */
+        new = topology_phys_to_logical_die(die, cpu);
+        if (new >= 0)
+                goto found;
+
+        new = logical_die++;
+        if (new != die) {
+                pr_info("CPU %u Converting physical %u to logical die %u\n",
+                        cpu, die, new);
+        }
+found:
+        cpu_data(cpu).logical_die_id = new;
+        return 0;
+}
 
 void __init smp_store_boot_cpu_info(void)
 {
@@ -347,6 +387,7 @@
         *c = boot_cpu_data;
         c->cpu_index = id;
         topology_update_package_map(c->phys_proc_id, id);
+        topology_update_die_map(c->cpu_die_id, id);
         c->initialized = true;
 }
 
@@ -401,6 +442,7 @@
                 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
                 if (c->phys_proc_id == o->phys_proc_id &&
+                    c->cpu_die_id == o->cpu_die_id &&
                     per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
                         if (c->cpu_core_id == o->cpu_core_id)
                                 return topology_sane(c, o, "smt");
@@ -412,6 +454,7 @@
                 }
 
         } else if (c->phys_proc_id == o->phys_proc_id &&
+                   c->cpu_die_id == o->cpu_die_id &&
                    c->cpu_core_id == o->cpu_core_id) {
                 return topology_sane(c, o, "smt");
         }
@@ -419,29 +462,52 @@
         return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+        if (c->phys_proc_id == o->phys_proc_id &&
+            c->cpu_die_id == o->cpu_die_id)
+                return true;
+        return false;
+}
+
 /*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+        if (c->phys_proc_id == o->phys_proc_id)
+                return true;
+        return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
  *
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
  *
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
  */
 
-static const struct x86_cpu_id snc_cpu[] = {
-        { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+static const struct x86_cpu_id intel_cod_cpu[] = {
+        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),       /* COD */
+        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),     /* COD */
+        X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),             /* SNC */
         {}
 };
 
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
+        const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
         int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+        bool intel_snc = id && id->driver_data;
 
         /* Do not match if we do not have a valid APICID for cpu: */
         if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -456,23 +522,12 @@
          * means 'c' does not share the LLC of 'o'. This will be
          * reflected to userspace.
          */
-        if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+        if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
                 return false;
 
         return topology_sane(c, o, "llc");
 }
 
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
-        if (c->phys_proc_id == o->phys_proc_id)
-                return true;
-        return false;
-}
 
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
@@ -536,6 +591,7 @@
                 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
                 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
                 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+                cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
                 c->booted_cores = 1;
                 return;
         }
@@ -543,13 +599,22 @@
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
 
+                if (match_pkg(c, o) && !topology_same_node(c, o))
+                        x86_has_numa_in_package = true;
+
                 if ((i == cpu) || (has_smt && match_smt(c, o)))
                         link_mask(topology_sibling_cpumask, cpu, i);
 
                 if ((i == cpu) || (has_mp && match_llc(c, o)))
                         link_mask(cpu_llc_shared_mask, cpu, i);
 
+                if ((i == cpu) || (has_mp && match_die(c, o)))
+                        link_mask(topology_die_cpumask, cpu, i);
         }
+
+        threads = cpumask_weight(topology_sibling_cpumask(cpu));
+        if (threads > __max_smt_threads)
+                __max_smt_threads = threads;
 
         /*
          * This needs a separate iteration over the cpus because we rely on all
@@ -558,14 +623,13 @@
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
 
-                if ((i == cpu) || (has_mp && match_die(c, o))) {
+                if ((i == cpu) || (has_mp && match_pkg(c, o))) {
                         link_mask(topology_core_cpumask, cpu, i);
 
                         /*
                          * Does this new cpu bringup a new core?
                          */
-                        if (cpumask_weight(
-                            topology_sibling_cpumask(cpu)) == 1) {
+                        if (threads == 1) {
                                 /*
                                  * for each core in package, increment
                                  * the booted_cores for this new cpu
@@ -582,13 +646,7 @@
                 } else if (i != cpu && !c->booted_cores)
                         c->booted_cores = cpu_data(i).booted_cores;
                 }
-                if (match_die(c, o) && !topology_same_node(c, o))
-                        x86_has_numa_in_package = true;
         }
-
-        threads = cpumask_weight(topology_sibling_cpumask(cpu));
-        if (threads > __max_smt_threads)
-                __max_smt_threads = threads;
 }
 
 /* maps the cpu to the sched domain representing multi-core */
@@ -684,6 +742,7 @@
 
         /* if modern processor, use no delay */
         if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+            ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
             ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
                 init_udelay = 0;
                 return;
@@ -848,7 +907,7 @@
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
-        static int current_node = -1;
+        static int current_node = NUMA_NO_NODE;
         int node = early_cpu_to_node(cpu);
         static int width, node_width;
 
@@ -946,20 +1005,28 @@
         return boot_error;
 }
 
-void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
+        int ret;
+
         /* Just in case we booted with a single CPU. */
         alternatives_enable_smp();
 
         per_cpu(current_task, cpu) = idle;
+        cpu_init_stack_canary(cpu, idle);
+
+        /* Initialize the interrupt stack(s) */
+        ret = irq_init_percpu_irqstack(cpu);
+        if (ret)
+                return ret;
 
 #ifdef CONFIG_X86_32
         /* Stack for startup_32 can be just as for start_secondary onwards */
-        irq_ctx_init(cpu);
         per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
         initial_gs = per_cpu_offset(cpu);
 #endif
+        return 0;
 }
 
 /*
@@ -971,8 +1038,6 @@
 static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
                        int *cpu0_nmi_registered)
 {
-        volatile u32 *trampoline_status =
-                (volatile u32 *) __va(real_mode_header->trampoline_status);
         /* start_ip had better be page-aligned! */
         unsigned long start_ip = real_mode_header->trampoline_start;
 
@@ -1064,9 +1129,6 @@
                 }
         }
 
-        /* mark "stuck" area as not stuck */
-        *trampoline_status = 0;
-
         if (x86_platform.legacy.warm_reset) {
                 /*
                  * Cleanup possible dangling ends...
@@ -1117,7 +1179,9 @@
         /* the FPU context is blank, nobody can own it */
         per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
-        common_cpu_up(cpu, tidle);
+        err = common_cpu_up(cpu, tidle);
+        if (err)
+                return err;
 
         err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
         if (err) {
@@ -1178,6 +1242,7 @@
         physid_set_mask_of_physid(0, &phys_cpu_present_map);
         cpumask_set_cpu(0, topology_sibling_cpumask(0));
         cpumask_set_cpu(0, topology_core_cpumask(0));
+        cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
 /*
@@ -1273,6 +1338,7 @@
         for_each_possible_cpu(i) {
                 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
                 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+                zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
                 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
         }
 
@@ -1286,7 +1352,7 @@
         set_sched_topology(x86_topology);
 
         set_cpu_sibling_map(0);
-
+        init_freq_invariance(false);
         smp_sanity_check();
 
         switch (apic_intr_mode) {
@@ -1312,8 +1378,6 @@
         pr_info("CPU0: ");
         print_cpu_info(&cpu_data(0));
 
-        native_pv_lock_init();
-
         uv_system_init();
 
         set_mtrr_aps_delayed_init();
@@ -1323,12 +1387,12 @@
         speculative_store_bypass_ht_init();
 }
 
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
 {
         set_mtrr_aps_delayed_init();
 }
 
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
 {
         mtrr_aps_init();
 }
@@ -1343,6 +1407,7 @@
         /* already set me in cpu_online_mask in boot_cpu_init() */
         cpumask_set_cpu(me, cpu_callout_mask);
         cpu_set_state_online(me);
+        native_pv_lock_init();
 }
 
 void __init calculate_max_logical_packages(void)
@@ -1384,7 +1449,7 @@
         /*
          * cpu_possible_mask should be static, it cannot change as cpu's
          * are onlined, or offlined. The reason is per-cpu data-structures
-         * are allocated by some modules at init time, and dont expect to
+         * are allocated by some modules at init time, and don't expect to
          * do this dynamically on cpu arrival/departure.
          * cpu_present_mask on the other hand can change dynamically.
          * In case when cpu_hotplug is not compiled, then we resort to current
@@ -1493,6 +1558,8 @@
                         cpu_data(sibling).booted_cores--;
         }
 
+        for_each_cpu(sibling, topology_die_cpumask(cpu))
+                cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
         for_each_cpu(sibling, topology_sibling_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
         for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1500,6 +1567,7 @@
         cpumask_clear(cpu_llc_shared_mask(cpu));
         cpumask_clear(topology_sibling_cpumask(cpu));
         cpumask_clear(topology_core_cpumask(cpu));
+        cpumask_clear(topology_die_cpumask(cpu));
         c->cpu_core_id = 0;
         c->booted_cores = 0;
         cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
@@ -1538,8 +1606,27 @@
         if (ret)
                 return ret;
 
-        clear_local_APIC();
         cpu_disable_common();
+
+        /*
+         * Disable the local APIC. Otherwise IPI broadcasts will reach
+         * it. It still responds normally to INIT, NMI, SMI, and SIPI
+         * messages.
+         *
+         * Disabling the APIC must happen after cpu_disable_common()
+         * which invokes fixup_irqs().
+         *
+         * Disabling the APIC preserves already set bits in IRR, but
+         * an interrupt arriving after disabling the local APIC does not
+         * set the corresponding IRR bit.
+         *
+         * fixup_irqs() scans IRR for set bits so it can raise a not
+         * yet handled interrupt on the new destination CPU via an IPI
+         * but obviously it can't do so for IRR bits which are not set.
+         * IOW, interrupts arriving after disabling the local APIC will
+         * be lost.
+         */
+        apic_soft_disable();
 
         return 0;
 }
@@ -1580,13 +1667,17 @@
         local_irq_disable();
 }
 
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
 {
         if (smp_processor_id() == 0 && enable_start_cpu0)
-                return true;
-
-        return false;
+                start_cpu0();
 }
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
 
 /*
  * We need to flush the caches before going to sleep, lest we have
@@ -1594,13 +1685,14 @@
  */
 static inline void mwait_play_dead(void)
 {
+        struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
         unsigned int eax, ebx, ecx, edx;
         unsigned int highest_cstate = 0;
         unsigned int highest_subcstate = 0;
-        void *mwait_ptr;
         int i;
 
-        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+            boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
                 return;
         if (!this_cpu_has(X86_FEATURE_MWAIT))
                 return;
@@ -1631,13 +1723,6 @@
                         (highest_subcstate - 1);
         }
 
-        /*
-         * This should be a memory location in a cache line which is
-         * unlikely to be touched by other processors. The actual
-         * content is immaterial as it is not actually modified in any way.
-         */
-        mwait_ptr = &current_thread_info()->flags;
-
         wbinvd();
 
         while (1) {
@@ -1649,16 +1734,13 @@
                  * case where we return around the loop.
                  */
                 mb();
-                clflush(mwait_ptr);
+                clflush(md);
                 mb();
-                __monitor(mwait_ptr, 0, 0);
+                __monitor(md, 0, 0);
                 mb();
                 __mwait(eax, 0);
-                /*
-                 * If NMI wants to wake up CPU0, start CPU0.
-                 */
-                if (wakeup_cpu0())
-                        start_cpu0();
+
+                cond_wakeup_cpu0();
         }
 }
 
@@ -1669,11 +1751,8 @@
 
         while (1) {
                 native_halt();
-                /*
-                 * If NMI wants to wake up CPU0, start CPU0.
-                 */
-                if (wakeup_cpu0())
-                        start_cpu0();
+
+                cond_wakeup_cpu0();
         }
 }
 
@@ -1705,3 +1784,339 @@
 }
 
 #endif
+
+#ifdef CONFIG_X86_64
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ *            BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+                                        arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool turbo_disabled(void)
+{
+        u64 misc_en;
+        int err;
+
+        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+        if (err)
+                return false;
+
+        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+        int err;
+
+        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+        *turbo_freq = *turbo_freq & 0x3F;       /* 1C turbo    */
+
+        return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define X86_MATCH(model)                                        \
+        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
+                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+        X86_MATCH(XEON_PHI_KNL),
+        X86_MATCH(XEON_PHI_KNM),
+        {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+        X86_MATCH(SKYLAKE_X),
+        {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+        X86_MATCH(ATOM_GOLDMONT),
+        X86_MATCH(ATOM_GOLDMONT_D),
+        X86_MATCH(ATOM_GOLDMONT_PLUS),
+        {}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+                                   int num_delta_fratio)
+{
+        int fratio, delta_fratio, found;
+        int err, i;
+        u64 msr;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+        if (err)
+                return false;
+
+        fratio = (msr >> 8) & 0xFF;
+        i = 16;
+        found = 0;
+        do {
+                if (found >= num_delta_fratio) {
+                        *turbo_freq = fratio;
+                        return true;
+                }
+
+                delta_fratio = (msr >> (i + 5)) & 0x7;
+
+                if (delta_fratio) {
+                        found += 1;
+                        fratio -= delta_fratio;
+                }
+
+                i += 8;
+        } while (i < 64);
+
+        return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+        u64 ratios, counts;
+        u32 group_size;
+        int err, i;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+        if (err)
+                return false;
+
+        for (i = 0; i < 64; i += 8) {
+                group_size = (counts >> i) & 0xFF;
+                if (group_size >= size) {
+                        *turbo_freq = (ratios >> i) & 0xFF;
+                        return true;
+                }
+        }
+
+        return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+        u64 msr;
+        int err;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+        *turbo_freq = (msr >> 24) & 0xFF;       /* 4C turbo    */
+
+        /* The CPU may have less than 4 cores */
+        if (!*turbo_freq)
+                *turbo_freq = msr & 0xFF;       /* 1C turbo    */
+
+        return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+        u64 base_freq, turbo_freq;
+        u64 turbo_ratio;
+
+        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+                goto out;
+
+        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+                goto out;
+
+        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+                goto out;
+
+        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+                goto out;
+
+        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+                goto out;
+
+        return false;
+
+out:
+        /*
+         * Some hypervisors advertise X86_FEATURE_APERFMPERF
+         * but then fill all MSR's with zeroes.
+         * Some CPUs have turbo boost but don't declare any turbo ratio
+         * in MSR_TURBO_RATIO_LIMIT.
+         */
+        if (!base_freq || !turbo_freq) {
+                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+                return false;
+        }
+
+        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+        if (!turbo_ratio) {
+                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+                return false;
+        }
+
+        arch_turbo_freq_ratio = turbo_ratio;
+        arch_set_max_freq_ratio(turbo_disabled());
+
+        return true;
+}
+
+static void init_counter_refs(void)
+{
+        u64 aperf, mperf;
+
+        rdmsrl(MSR_IA32_APERF, aperf);
+        rdmsrl(MSR_IA32_MPERF, mperf);
+
+        this_cpu_write(arch_prev_aperf, aperf);
+        this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+        bool ret = false;
+
+        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+                return;
+
+        if (secondary) {
+                if (static_branch_likely(&arch_scale_freq_key)) {
+                        init_counter_refs();
+                }
+                return;
+        }
+
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+                ret = intel_set_max_freq_ratio();
+
+        if (ret) {
+                init_counter_refs();
+                static_branch_enable(&arch_scale_freq_key);
+        } else {
+                pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+        }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+        static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+                    disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+        u64 freq_scale = SCHED_CAPACITY_SCALE;
+        u64 aperf, mperf;
+        u64 acnt, mcnt;
+
+        if (!arch_scale_freq_invariant())
+                return;
+
+        rdmsrl(MSR_IA32_APERF, aperf);
+        rdmsrl(MSR_IA32_MPERF, mperf);
+
+        acnt = aperf - this_cpu_read(arch_prev_aperf);
+        mcnt = mperf - this_cpu_read(arch_prev_mperf);
+
+        this_cpu_write(arch_prev_aperf, aperf);
+        this_cpu_write(arch_prev_mperf, mperf);
+
+        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+                goto error;
+
+        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+                goto error;
+
+        freq_scale = div64_u64(acnt, mcnt);
+        if (!freq_scale)
+                goto error;
+
+        if (freq_scale > SCHED_CAPACITY_SCALE)
+                freq_scale = SCHED_CAPACITY_SCALE;
+
+        this_cpu_write(arch_freq_scale, freq_scale);
+        return;
+
+error:
+        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+        schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
+}
+#endif /* CONFIG_X86_64 */