2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/x86/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * x86 SMP booting functions
  *
@@ -11,9 +12,6 @@
  * Thanks to Intel for making available several different Pentium,
  * Pentium Pro and Pentium-II/Xeon MP machines.
  * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
  *
  * Fixes
  *	Felix Koop	:	NR_CPUS used properly
@@ -49,13 +47,15 @@
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/nmi.h>
 #include <linux/tboot.h>
-#include <linux/stackprotector.h>
 #include <linux/gfp.h>
 #include <linux/cpuidle.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -64,7 +64,6 @@
 #include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
 #include <asm/mwait.h>
@@ -81,6 +80,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/spec-ctrl.h>
 #include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -89,6 +89,10 @@
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
 
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
@@ -100,6 +104,7 @@
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
+static unsigned int logical_die __read_mostly;
 
 /* Maximum number of SMT threads on any online core */
 int __read_mostly __max_smt_threads = 1;
@@ -143,13 +148,15 @@
 	*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
+static void init_freq_invariance(bool secondary);
+
 /*
  * Report back to the Boot Processor during boot time or to the caller processor
  * during CPU online.
  */
 static void smp_callin(void)
 {
-	int cpuid, phys_id;
+	int cpuid;
 
 	/*
 	 * If waken up by an INIT in an 82489DX configuration
@@ -158,11 +165,6 @@
 	 * now safe to touch our local APIC.
 	 */
 	cpuid = smp_processor_id();
-
-	/*
-	 * (This works even if the APIC is not enabled.)
-	 */
-	phys_id = read_apic_id();
 
 	/*
 	 * the boot CPU has finished the init stage and is spinning
@@ -183,6 +185,8 @@
 	 * calibrate_delay() and notify_cpu_starting().
 	 */
 	set_cpu_sibling_map(raw_smp_processor_id());
+
+	init_freq_invariance(true);
 
 	/*
 	 * Get our bogomips.
@@ -216,23 +220,17 @@
 	 * before cpu_init(), SMP booting is too fragile that we want to
 	 * limit the things done here to the most necessary things.
 	 */
-	if (boot_cpu_has(X86_FEATURE_PCID))
-		__write_cr4(__read_cr4() | X86_CR4_PCIDE);
+	cr4_init();
 
 #ifdef CONFIG_X86_32
 	/* switch away from the initial page table */
 	load_cr3(swapper_pg_dir);
-	/*
-	 * Initialize the CR4 shadow before doing anything that could
-	 * try to read it.
-	 */
-	cr4_init_shadow();
 	__flush_tlb_all();
 #endif
-	load_current_idt();
+	cpu_init_exception_handling();
 	cpu_init();
+	rcu_cpu_starting(raw_smp_processor_id());
 	x86_cpuinit.early_percpu_clock_init();
-	preempt_disable();
 	smp_callin();
 
 	enable_start_cpu0 = 0;
@@ -262,21 +260,10 @@
 	/* enable local interrupts */
 	local_irq_enable();
 
-	/* to prevent fake stack check failure in clock setup */
-	boot_init_stack_canary();
-
 	x86_cpuinit.setup_percpu_clockev();
 
 	wmb();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
-	/*
-	 * Prevent tail call to cpu_startup_entry() because the stack protector
-	 * guard has been changed a couple of function calls up, in
-	 * boot_init_stack_canary() and must not be checked before tail calling
-	 * another function.
-	 */
-	prevent_tail_call_optimization();
 }
 
 /**
@@ -314,6 +301,26 @@
 	return -1;
 }
 EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+/**
+ * topology_phys_to_logical_die - Map a physical die id to logical
+ *
+ * Returns logical die id or -1 if not found
+ */
+int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
+{
+	int cpu;
+	int proc_id = cpu_data(cur_cpu).phys_proc_id;
+
+	for_each_possible_cpu(cpu) {
+		struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+		if (c->initialized && c->cpu_die_id == die_id &&
+		    c->phys_proc_id == proc_id)
+			return c->logical_die_id;
+	}
+	return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_die);
 
 /**
  * topology_update_package_map - Update the physical to logical package map
@@ -338,6 +345,29 @@
 	cpu_data(cpu).logical_proc_id = new;
 	return 0;
 }
+/**
+ * topology_update_die_map - Update the physical to logical die map
+ * @die:	The die id as retrieved via CPUID
+ * @cpu:	The cpu for which this is updated
+ */
+int topology_update_die_map(unsigned int die, unsigned int cpu)
+{
+	int new;
+
+	/* Already available somewhere? */
+	new = topology_phys_to_logical_die(die, cpu);
+	if (new >= 0)
+		goto found;
+
+	new = logical_die++;
+	if (new != die) {
+		pr_info("CPU %u Converting physical %u to logical die %u\n",
+			cpu, die, new);
+	}
+found:
+	cpu_data(cpu).logical_die_id = new;
+	return 0;
+}
 
 void __init smp_store_boot_cpu_info(void)
 {
@@ -347,6 +377,7 @@
 	*c = boot_cpu_data;
 	c->cpu_index = id;
 	topology_update_package_map(c->phys_proc_id, id);
+	topology_update_die_map(c->cpu_die_id, id);
 	c->initialized = true;
 }
 
@@ -401,6 +432,7 @@
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
 	if (c->phys_proc_id == o->phys_proc_id &&
+	    c->cpu_die_id == o->cpu_die_id &&
 	    per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
 		if (c->cpu_core_id == o->cpu_core_id)
 			return topology_sane(c, o, "smt");
@@ -412,6 +444,7 @@
 		}
 
 	} else if (c->phys_proc_id == o->phys_proc_id &&
+		   c->cpu_die_id == o->cpu_die_id &&
 		   c->cpu_core_id == o->cpu_core_id) {
 		return topology_sane(c, o, "smt");
 	}
@@ -419,29 +452,52 @@
 	return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id &&
+	    c->cpu_die_id == o->cpu_die_id)
+		return true;
+	return false;
+}
+
 /*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+	if (c->phys_proc_id == o->phys_proc_id)
+		return true;
+	return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
  *
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
  *
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
  */
 
-static const struct x86_cpu_id snc_cpu[] = {
-	{ X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+static const struct x86_cpu_id intel_cod_cpu[] = {
+	X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),	/* COD */
+	X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),	/* COD */
+	X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),		/* SNC */
 	{}
 };
 
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
+	const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
 	int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+	bool intel_snc = id && id->driver_data;
 
 	/* Do not match if we do not have a valid APICID for cpu: */
 	if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -456,23 +512,12 @@
 	 * means 'c' does not share the LLC of 'o'. This will be
 	 * reflected to userspace.
 	 */
-	if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+	if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
 		return false;
 
 	return topology_sane(c, o, "llc");
 }
 
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
-	if (c->phys_proc_id == o->phys_proc_id)
-		return true;
-	return false;
-}
 
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
@@ -536,6 +581,7 @@
 		cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
 		cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
 		cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+		cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
 		c->booted_cores = 1;
 		return;
 	}
@@ -543,13 +589,22 @@
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
+		if (match_pkg(c, o) && !topology_same_node(c, o))
+			x86_has_numa_in_package = true;
+
 		if ((i == cpu) || (has_smt && match_smt(c, o)))
 			link_mask(topology_sibling_cpumask, cpu, i);
 
 		if ((i == cpu) || (has_mp && match_llc(c, o)))
 			link_mask(cpu_llc_shared_mask, cpu, i);
 
+		if ((i == cpu) || (has_mp && match_die(c, o)))
+			link_mask(topology_die_cpumask, cpu, i);
 	}
+
+	threads = cpumask_weight(topology_sibling_cpumask(cpu));
+	if (threads > __max_smt_threads)
+		__max_smt_threads = threads;
 
 	/*
 	 * This needs a separate iteration over the cpus because we rely on all
@@ -558,14 +613,13 @@
 	for_each_cpu(i, cpu_sibling_setup_mask) {
 		o = &cpu_data(i);
 
-		if ((i == cpu) || (has_mp && match_die(c, o))) {
+		if ((i == cpu) || (has_mp && match_pkg(c, o))) {
 			link_mask(topology_core_cpumask, cpu, i);
 
 			/*
 			 * Does this new cpu bringup a new core?
 			 */
-			if (cpumask_weight(
-			    topology_sibling_cpumask(cpu)) == 1) {
+			if (threads == 1) {
 				/*
 				 * for each core in package, increment
 				 * the booted_cores for this new cpu
@@ -582,13 +636,7 @@
 		} else if (i != cpu && !c->booted_cores)
 			c->booted_cores = cpu_data(i).booted_cores;
 		}
-		if (match_die(c, o) && !topology_same_node(c, o))
-			x86_has_numa_in_package = true;
 	}
-
-	threads = cpumask_weight(topology_sibling_cpumask(cpu));
-	if (threads > __max_smt_threads)
-		__max_smt_threads = threads;
 }
 
 /* maps the cpu to the sched domain representing multi-core */
@@ -684,6 +732,7 @@
 
 	/* if modern processor, use no delay */
 	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+	    ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
 	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
 		init_udelay = 0;
 		return;
@@ -848,7 +897,7 @@
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
-	static int current_node = -1;
+	static int current_node = NUMA_NO_NODE;
 	int node = early_cpu_to_node(cpu);
 	static int width, node_width;
 
@@ -946,20 +995,28 @@
 	return boot_error;
 }
 
-void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
+	int ret;
+
 	/* Just in case we booted with a single CPU. */
 	alternatives_enable_smp();
 
 	per_cpu(current_task, cpu) = idle;
+	cpu_init_stack_canary(cpu, idle);
+
+	/* Initialize the interrupt stack(s) */
+	ret = irq_init_percpu_irqstack(cpu);
+	if (ret)
+		return ret;
 
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
-	irq_ctx_init(cpu);
 	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
 	initial_gs = per_cpu_offset(cpu);
 #endif
+	return 0;
 }
 
 /*
@@ -971,8 +1028,6 @@
 static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
 		       int *cpu0_nmi_registered)
 {
-	volatile u32 *trampoline_status =
-		(volatile u32 *) __va(real_mode_header->trampoline_status);
 	/* start_ip had better be page-aligned! */
 	unsigned long start_ip = real_mode_header->trampoline_start;
 
@@ -1064,9 +1119,6 @@
 		}
 	}
 
-	/* mark "stuck" area as not stuck */
-	*trampoline_status = 0;
-
 	if (x86_platform.legacy.warm_reset) {
 		/*
 		 * Cleanup possible dangling ends...
@@ -1117,7 +1169,9 @@
 	/* the FPU context is blank, nobody can own it */
 	per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
-	common_cpu_up(cpu, tidle);
+	err = common_cpu_up(cpu, tidle);
+	if (err)
+		return err;
 
 	err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
 	if (err) {
@@ -1178,6 +1232,7 @@
 	physid_set_mask_of_physid(0, &phys_cpu_present_map);
 	cpumask_set_cpu(0, topology_sibling_cpumask(0));
 	cpumask_set_cpu(0, topology_core_cpumask(0));
+	cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
 /*
@@ -1273,6 +1328,7 @@
 	for_each_possible_cpu(i) {
 		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
 	}
 
@@ -1286,7 +1342,7 @@
 	set_sched_topology(x86_topology);
 
 	set_cpu_sibling_map(0);
-
+	init_freq_invariance(false);
 	smp_sanity_check();
 
 	switch (apic_intr_mode) {
@@ -1312,8 +1368,6 @@
 	pr_info("CPU0: ");
 	print_cpu_info(&cpu_data(0));
 
-	native_pv_lock_init();
-
 	uv_system_init();
 
 	set_mtrr_aps_delayed_init();
@@ -1323,12 +1377,12 @@
 	speculative_store_bypass_ht_init();
 }
 
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
 {
 	set_mtrr_aps_delayed_init();
 }
 
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
 {
 	mtrr_aps_init();
 }
@@ -1343,6 +1397,7 @@
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
 	cpu_set_state_online(me);
+	native_pv_lock_init();
 }
 
 void __init calculate_max_logical_packages(void)
@@ -1384,7 +1439,7 @@
 	/*
 	 * cpu_possible_mask should be static, it cannot change as cpu's
 	 * are onlined, or offlined. The reason is per-cpu data-structures
-	 * are allocated by some modules at init time, and dont expect to
+	 * are allocated by some modules at init time, and don't expect to
 	 * do this dynamically on cpu arrival/departure.
 	 * cpu_present_mask on the other hand can change dynamically.
 	 * In case when cpu_hotplug is not compiled, then we resort to current
@@ -1493,6 +1548,8 @@
 			cpu_data(sibling).booted_cores--;
 	}
 
+	for_each_cpu(sibling, topology_die_cpumask(cpu))
+		cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
 	for_each_cpu(sibling, topology_sibling_cpumask(cpu))
 		cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
 	for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1500,6 +1557,7 @@
 	cpumask_clear(cpu_llc_shared_mask(cpu));
 	cpumask_clear(topology_sibling_cpumask(cpu));
 	cpumask_clear(topology_core_cpumask(cpu));
+	cpumask_clear(topology_die_cpumask(cpu));
 	c->cpu_core_id = 0;
 	c->booted_cores = 0;
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
@@ -1538,8 +1596,27 @@
 	if (ret)
 		return ret;
 
-	clear_local_APIC();
 	cpu_disable_common();
+
+	/*
+	 * Disable the local APIC. Otherwise IPI broadcasts will reach
+	 * it. It still responds normally to INIT, NMI, SMI, and SIPI
+	 * messages.
+	 *
+	 * Disabling the APIC must happen after cpu_disable_common()
+	 * which invokes fixup_irqs().
+	 *
+	 * Disabling the APIC preserves already set bits in IRR, but
+	 * an interrupt arriving after disabling the local APIC does not
+	 * set the corresponding IRR bit.
+	 *
+	 * fixup_irqs() scans IRR for set bits so it can raise a not
+	 * yet handled interrupt on the new destination CPU via an IPI
+	 * but obviously it can't do so for IRR bits which are not set.
+	 * IOW, interrupts arriving after disabling the local APIC will
+	 * be lost.
+	 */
+	apic_soft_disable();
 
 	return 0;
 }
@@ -1580,13 +1657,17 @@
 	local_irq_disable();
 }
 
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
 {
 	if (smp_processor_id() == 0 && enable_start_cpu0)
-		return true;
-
-	return false;
+		start_cpu0();
 }
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
 
 /*
  * We need to flush the caches before going to sleep, lest we have
@@ -1600,7 +1681,8 @@
 	void *mwait_ptr;
 	int i;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
 		return;
 	if (!this_cpu_has(X86_FEATURE_MWAIT))
 		return;
@@ -1654,11 +1736,8 @@
 		__monitor(mwait_ptr, 0, 0);
 		mb();
 		__mwait(eax, 0);
-		/*
-		 * If NMI wants to wake up CPU0, start CPU0.
-		 */
-		if (wakeup_cpu0())
-			start_cpu0();
+
+		cond_wakeup_cpu0();
 	}
 }
 
@@ -1669,11 +1748,8 @@
 
 	while (1) {
 		native_halt();
-		/*
-		 * If NMI wants to wake up CPU0, start CPU0.
-		 */
-		if (wakeup_cpu0())
-			start_cpu0();
+
+		cond_wakeup_cpu0();
 	}
 }
 
@@ -1705,3 +1781,339 @@
 }
 
 #endif
+
+#ifdef CONFIG_X86_64
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ *            BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+	arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+					arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool turbo_disabled(void)
+{
+	u64 misc_en;
+	int err;
+
+	err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+	if (err)
+		return false;
+
+	return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+	int err;
+
+	err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 16) & 0x3F;	/* max P state */
+	*turbo_freq = *turbo_freq & 0x3F;	/* 1C turbo    */
+
+	return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define X86_MATCH(model)					\
+	X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,		\
+		INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+	X86_MATCH(XEON_PHI_KNL),
+	X86_MATCH(XEON_PHI_KNM),
+	{}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+	X86_MATCH(SKYLAKE_X),
+	{}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+	X86_MATCH(ATOM_GOLDMONT),
+	X86_MATCH(ATOM_GOLDMONT_D),
+	X86_MATCH(ATOM_GOLDMONT_PLUS),
+	{}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+				int num_delta_fratio)
+{
+	int fratio, delta_fratio, found;
+	int err, i;
+	u64 msr;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+	if (err)
+		return false;
+
+	fratio = (msr >> 8) & 0xFF;
+	i = 16;
+	found = 0;
+	do {
+		if (found >= num_delta_fratio) {
+			*turbo_freq = fratio;
+			return true;
+		}
+
+		delta_fratio = (msr >> (i + 5)) & 0x7;
+
+		if (delta_fratio) {
+			found += 1;
+			fratio -= delta_fratio;
+		}
+
+		i += 8;
+	} while (i < 64);
+
+	return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+	u64 ratios, counts;
+	u32 group_size;
+	int err, i;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+	if (err)
+		return false;
+
+	for (i = 0; i < 64; i += 8) {
+		group_size = (counts >> i) & 0xFF;
+		if (group_size >= size) {
+			*turbo_freq = (ratios >> i) & 0xFF;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+	u64 msr;
+	int err;
+
+	err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+	if (err)
+		return false;
+
+	err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+	if (err)
+		return false;
+
+	*base_freq = (*base_freq >> 8) & 0xFF;	/* max P state */
+	*turbo_freq = (msr >> 24) & 0xFF;	/* 4C turbo    */
+
+	/* The CPU may have less than 4 cores */
+	if (!*turbo_freq)
+		*turbo_freq = msr & 0xFF;	/* 1C turbo    */
+
+	return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+	u64 base_freq, turbo_freq;
+	u64 turbo_ratio;
+
+	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+		goto out;
+
+	if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+		goto out;
+
+	if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+	    knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+		goto out;
+
+	if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+	    skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+		goto out;
+
+	if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+		goto out;
+
+	return false;
+
+out:
+	/*
+	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
+	 * but then fill all MSR's with zeroes.
+	 * Some CPUs have turbo boost but don't declare any turbo ratio
+	 * in MSR_TURBO_RATIO_LIMIT.
+	 */
+	if (!base_freq || !turbo_freq) {
+		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+		return false;
+	}
+
+	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+	if (!turbo_ratio) {
+		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+		return false;
+	}
+
+	arch_turbo_freq_ratio = turbo_ratio;
+	arch_set_max_freq_ratio(turbo_disabled());
+
+	return true;
+}
+
+static void init_counter_refs(void)
+{
+	u64 aperf, mperf;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	this_cpu_write(arch_prev_aperf, aperf);
+	this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+	bool ret = false;
+
+	if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+		return;
+
+	if (secondary) {
+		if (static_branch_likely(&arch_scale_freq_key)) {
+			init_counter_refs();
+		}
+		return;
+	}
+
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		ret = intel_set_max_freq_ratio();
+
+	if (ret) {
+		init_counter_refs();
+		static_branch_enable(&arch_scale_freq_key);
+	} else {
+		pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+	}
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+	static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+		    disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+	u64 freq_scale = SCHED_CAPACITY_SCALE;
+	u64 aperf, mperf;
+	u64 acnt, mcnt;
+
+	if (!arch_scale_freq_invariant())
+		return;
+
+	rdmsrl(MSR_IA32_APERF, aperf);
+	rdmsrl(MSR_IA32_MPERF, mperf);
+
+	acnt = aperf - this_cpu_read(arch_prev_aperf);
+	mcnt = mperf - this_cpu_read(arch_prev_mperf);
+
+	this_cpu_write(arch_prev_aperf, aperf);
+	this_cpu_write(arch_prev_mperf, mperf);
+
+	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+		goto error;
+
+	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+		goto error;
+
+	freq_scale = div64_u64(acnt, mcnt);
+	if (!freq_scale)
+		goto error;
+
+	if (freq_scale > SCHED_CAPACITY_SCALE)
+		freq_scale = SCHED_CAPACITY_SCALE;
+
+	this_cpu_write(arch_freq_scale, freq_scale);
+	return;
+
+error:
+	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+	schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
+}
+#endif /* CONFIG_X86_64 */
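
For reference, the per-tick scale factor computed in arch_scale_freq_tick() above reduces to

	freq_scale = (delta_APERF << 2*SCHED_CAPACITY_SHIFT) / (delta_MPERF * arch_max_freq_ratio)

clipped to SCHED_CAPACITY_SCALE. The stand-alone C sketch below is illustrative only: it is not part of the patch, and the APERF/MPERF deltas and turbo/base ratio are made-up sample values rather than real MSR reads. It just reproduces the same arithmetic in user space.

/*
 * Minimal sketch of the arch_scale_freq_tick() ratio math, using
 * invented sample deltas instead of reading the MSRs.
 */
#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t acnt = 180000;		/* hypothetical delta_APERF over one tick */
	uint64_t mcnt = 200000;		/* hypothetical delta_MPERF over one tick */
	uint64_t max_freq_ratio = 1434;	/* ~3.5 GHz 4C turbo / 2.5 GHz base, scaled by 1024 */
	uint64_t freq_scale;

	/* acnt carries SCALE^2 so the quotient keeps one factor of SCALE */
	acnt <<= 2 * SCHED_CAPACITY_SHIFT;
	mcnt *= max_freq_ratio;

	freq_scale = acnt / mcnt;
	if (freq_scale > SCHED_CAPACITY_SCALE)
		freq_scale = SCHED_CAPACITY_SCALE;

	printf("freq_scale = %llu of %lu\n",
	       (unsigned long long)freq_scale,
	       (unsigned long)SCHED_CAPACITY_SCALE);
	return 0;
}

With these inputs the program prints "freq_scale = 658 of 1024", i.e. the CPU spent the interval at roughly 64% of the assumed 4-core-turbo maximum, which is the fraction the scheduler then uses for frequency-invariant load accounting.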