forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
kernel/arch/x86/kernel/smpboot.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
 /*
  * x86 SMP booting functions
  *
@@ -11,9 +12,6 @@
  * Thanks to Intel for making available several different Pentium,
  * Pentium Pro and Pentium-II/Xeon MP machines.
  * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
  *
  * Fixes
  *	Felix Koop : NR_CPUS used properly
@@ -49,13 +47,15 @@
 #include <linux/sched/hotplug.h>
 #include <linux/sched/task_stack.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/err.h>
 #include <linux/nmi.h>
 #include <linux/tboot.h>
-#include <linux/stackprotector.h>
 #include <linux/gfp.h>
 #include <linux/cpuidle.h>
+#include <linux/numa.h>
+#include <linux/pgtable.h>
+#include <linux/overflow.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>
@@ -64,7 +64,6 @@
 #include <asm/realmode.h>
 #include <asm/cpu.h>
 #include <asm/numa.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
 #include <asm/mwait.h>
@@ -81,6 +80,7 @@
 #include <asm/cpu_device_id.h>
 #include <asm/spec-ctrl.h>
 #include <asm/hw_irq.h>
+#include <asm/stackprotector.h>
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
@@ -90,16 +90,32 @@
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
+/* representing HT, core, and die siblings of each logical CPU */
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_die_map);
+EXPORT_PER_CPU_SYMBOL(cpu_die_map);
+
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 
 /* Per CPU bogomips and other parameters */
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);
 
+struct mwait_cpu_dead {
+        unsigned int control;
+        unsigned int status;
+};
+
+/*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
+
 /* Logical package management. We might want to allocate that dynamically */
 unsigned int __max_logical_packages __read_mostly;
 EXPORT_SYMBOL(__max_logical_packages);
 static unsigned int logical_packages __read_mostly;
+static unsigned int logical_die __read_mostly;
 
 /* Maximum number of SMT threads on any online core */
 int __read_mostly __max_smt_threads = 1;
@@ -143,13 +159,15 @@
         *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
+static void init_freq_invariance(bool secondary);
+
 /*
  * Report back to the Boot Processor during boot time or to the caller processor
  * during CPU online.
  */
 static void smp_callin(void)
 {
-        int cpuid, phys_id;
+        int cpuid;
 
         /*
          * If waken up by an INIT in an 82489DX configuration
@@ -158,11 +176,6 @@
          * now safe to touch our local APIC.
          */
         cpuid = smp_processor_id();
-
-        /*
-         * (This works even if the APIC is not enabled.)
-         */
-        phys_id = read_apic_id();
 
         /*
          * the boot CPU has finished the init stage and is spinning
@@ -183,6 +196,8 @@
          * calibrate_delay() and notify_cpu_starting().
          */
         set_cpu_sibling_map(raw_smp_processor_id());
+
+        init_freq_invariance(true);
 
         /*
          * Get our bogomips.
@@ -216,23 +231,16 @@
          * before cpu_init(), SMP booting is too fragile that we want to
          * limit the things done here to the most necessary things.
          */
-        if (boot_cpu_has(X86_FEATURE_PCID))
-                __write_cr4(__read_cr4() | X86_CR4_PCIDE);
+        cr4_init();
 
 #ifdef CONFIG_X86_32
         /* switch away from the initial page table */
         load_cr3(swapper_pg_dir);
-        /*
-         * Initialize the CR4 shadow before doing anything that could
-         * try to read it.
-         */
-        cr4_init_shadow();
         __flush_tlb_all();
 #endif
-        load_current_idt();
-        cpu_init();
+        cpu_init_secondary();
+        rcu_cpu_starting(raw_smp_processor_id());
         x86_cpuinit.early_percpu_clock_init();
-        preempt_disable();
         smp_callin();
 
         enable_start_cpu0 = 0;
@@ -262,21 +270,10 @@
         /* enable local interrupts */
         local_irq_enable();
 
-        /* to prevent fake stack check failure in clock setup */
-        boot_init_stack_canary();
-
         x86_cpuinit.setup_percpu_clockev();
 
         wmb();
         cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-
-        /*
-         * Prevent tail call to cpu_startup_entry() because the stack protector
-         * guard has been changed a couple of function calls up, in
-         * boot_init_stack_canary() and must not be checked before tail calling
-         * another function.
-         */
-        prevent_tail_call_optimization();
 }
 
 /**
@@ -314,6 +311,26 @@
         return -1;
 }
 EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+/**
+ * topology_phys_to_logical_die - Map a physical die id to logical
+ *
+ * Returns logical die id or -1 if not found
+ */
+int topology_phys_to_logical_die(unsigned int die_id, unsigned int cur_cpu)
+{
+        int cpu;
+        int proc_id = cpu_data(cur_cpu).phys_proc_id;
+
+        for_each_possible_cpu(cpu) {
+                struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+                if (c->initialized && c->cpu_die_id == die_id &&
+                    c->phys_proc_id == proc_id)
+                        return c->logical_die_id;
+        }
+        return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_die);
 
 /**
  * topology_update_package_map - Update the physical to logical package map
@@ -338,6 +355,29 @@
         cpu_data(cpu).logical_proc_id = new;
         return 0;
 }
+/**
+ * topology_update_die_map - Update the physical to logical die map
+ * @die: The die id as retrieved via CPUID
+ * @cpu: The cpu for which this is updated
+ */
+int topology_update_die_map(unsigned int die, unsigned int cpu)
+{
+        int new;
+
+        /* Already available somewhere? */
+        new = topology_phys_to_logical_die(die, cpu);
+        if (new >= 0)
+                goto found;
+
+        new = logical_die++;
+        if (new != die) {
+                pr_info("CPU %u Converting physical %u to logical die %u\n",
+                        cpu, die, new);
+        }
+found:
+        cpu_data(cpu).logical_die_id = new;
+        return 0;
+}
 
 void __init smp_store_boot_cpu_info(void)
 {
@@ -347,6 +387,7 @@
         *c = boot_cpu_data;
         c->cpu_index = id;
         topology_update_package_map(c->phys_proc_id, id);
+        topology_update_die_map(c->cpu_die_id, id);
         c->initialized = true;
 }
 
@@ -401,6 +442,7 @@
                 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
 
                 if (c->phys_proc_id == o->phys_proc_id &&
+                    c->cpu_die_id == o->cpu_die_id &&
                     per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2)) {
                         if (c->cpu_core_id == o->cpu_core_id)
                                 return topology_sane(c, o, "smt");
@@ -412,6 +454,7 @@
                 }
 
         } else if (c->phys_proc_id == o->phys_proc_id &&
+                   c->cpu_die_id == o->cpu_die_id &&
                    c->cpu_core_id == o->cpu_core_id) {
                 return topology_sane(c, o, "smt");
         }
@@ -419,29 +462,52 @@
         return false;
 }
 
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+        if (c->phys_proc_id == o->phys_proc_id &&
+            c->cpu_die_id == o->cpu_die_id)
+                return true;
+        return false;
+}
+
 /*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+        if (c->phys_proc_id == o->phys_proc_id)
+                return true;
+        return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
  *
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
  *
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
  */
 
-static const struct x86_cpu_id snc_cpu[] = {
-        { X86_VENDOR_INTEL, 6, INTEL_FAM6_SKYLAKE_X },
+static const struct x86_cpu_id intel_cod_cpu[] = {
+        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0),       /* COD */
+        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0),     /* COD */
+        X86_MATCH_INTEL_FAM6_MODEL(ANY, 1),             /* SNC */
         {}
 };
 
 static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 {
+        const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
         int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+        bool intel_snc = id && id->driver_data;
 
         /* Do not match if we do not have a valid APICID for cpu: */
         if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -456,23 +522,12 @@
          * means 'c' does not share the LLC of 'o'. This will be
          * reflected to userspace.
          */
-        if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+        if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
                 return false;
 
         return topology_sane(c, o, "llc");
 }
 
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
-        if (c->phys_proc_id == o->phys_proc_id)
-                return true;
-        return false;
-}
 
 #if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
 static inline int x86_sched_itmt_flags(void)
@@ -536,6 +591,7 @@
                 cpumask_set_cpu(cpu, topology_sibling_cpumask(cpu));
                 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
                 cpumask_set_cpu(cpu, topology_core_cpumask(cpu));
+                cpumask_set_cpu(cpu, topology_die_cpumask(cpu));
                 c->booted_cores = 1;
                 return;
         }
@@ -543,13 +599,22 @@
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
 
+                if (match_pkg(c, o) && !topology_same_node(c, o))
+                        x86_has_numa_in_package = true;
+
                 if ((i == cpu) || (has_smt && match_smt(c, o)))
                         link_mask(topology_sibling_cpumask, cpu, i);
 
                 if ((i == cpu) || (has_mp && match_llc(c, o)))
                         link_mask(cpu_llc_shared_mask, cpu, i);
 
+                if ((i == cpu) || (has_mp && match_die(c, o)))
+                        link_mask(topology_die_cpumask, cpu, i);
         }
+
+        threads = cpumask_weight(topology_sibling_cpumask(cpu));
+        if (threads > __max_smt_threads)
+                __max_smt_threads = threads;
 
         /*
          * This needs a separate iteration over the cpus because we rely on all
@@ -558,14 +623,13 @@
         for_each_cpu(i, cpu_sibling_setup_mask) {
                 o = &cpu_data(i);
 
-                if ((i == cpu) || (has_mp && match_die(c, o))) {
+                if ((i == cpu) || (has_mp && match_pkg(c, o))) {
                         link_mask(topology_core_cpumask, cpu, i);
 
                         /*
                          * Does this new cpu bringup a new core?
                          */
-                        if (cpumask_weight(
-                            topology_sibling_cpumask(cpu)) == 1) {
+                        if (threads == 1) {
                                 /*
                                  * for each core in package, increment
                                  * the booted_cores for this new cpu
@@ -582,13 +646,7 @@
                 } else if (i != cpu && !c->booted_cores)
                         c->booted_cores = cpu_data(i).booted_cores;
                 }
-                if (match_die(c, o) && !topology_same_node(c, o))
-                        x86_has_numa_in_package = true;
         }
-
-        threads = cpumask_weight(topology_sibling_cpumask(cpu));
-        if (threads > __max_smt_threads)
-                __max_smt_threads = threads;
 }
 
 /* maps the cpu to the sched domain representing multi-core */
@@ -684,6 +742,7 @@
 
         /* if modern processor, use no delay */
         if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+            ((boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) && (boot_cpu_data.x86 >= 0x18)) ||
             ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF))) {
                 init_udelay = 0;
                 return;
@@ -848,7 +907,7 @@
 /* reduce the number of lines printed when booting a large cpu count system */
 static void announce_cpu(int cpu, int apicid)
 {
-        static int current_node = -1;
+        static int current_node = NUMA_NO_NODE;
         int node = early_cpu_to_node(cpu);
         static int width, node_width;
 
@@ -946,20 +1005,28 @@
         return boot_error;
 }
 
-void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
+        int ret;
+
         /* Just in case we booted with a single CPU. */
         alternatives_enable_smp();
 
         per_cpu(current_task, cpu) = idle;
+        cpu_init_stack_canary(cpu, idle);
+
+        /* Initialize the interrupt stack(s) */
+        ret = irq_init_percpu_irqstack(cpu);
+        if (ret)
+                return ret;
 
 #ifdef CONFIG_X86_32
         /* Stack for startup_32 can be just as for start_secondary onwards */
-        irq_ctx_init(cpu);
         per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
         initial_gs = per_cpu_offset(cpu);
 #endif
+        return 0;
 }
 
 /*
@@ -971,8 +1038,6 @@
 static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
                        int *cpu0_nmi_registered)
 {
-        volatile u32 *trampoline_status =
-                (volatile u32 *) __va(real_mode_header->trampoline_status);
         /* start_ip had better be page-aligned! */
         unsigned long start_ip = real_mode_header->trampoline_start;
 
@@ -1064,9 +1129,6 @@
                 }
         }
 
-        /* mark "stuck" area as not stuck */
-        *trampoline_status = 0;
-
         if (x86_platform.legacy.warm_reset) {
                 /*
                  * Cleanup possible dangling ends...
@@ -1117,7 +1179,9 @@
         /* the FPU context is blank, nobody can own it */
         per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
 
-        common_cpu_up(cpu, tidle);
+        err = common_cpu_up(cpu, tidle);
+        if (err)
+                return err;
 
         err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
         if (err) {
@@ -1178,6 +1242,7 @@
         physid_set_mask_of_physid(0, &phys_cpu_present_map);
         cpumask_set_cpu(0, topology_sibling_cpumask(0));
         cpumask_set_cpu(0, topology_core_cpumask(0));
+        cpumask_set_cpu(0, topology_die_cpumask(0));
 }
 
 /*
@@ -1273,6 +1338,7 @@
         for_each_possible_cpu(i) {
                 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
                 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+                zalloc_cpumask_var(&per_cpu(cpu_die_map, i), GFP_KERNEL);
                 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
         }
 
@@ -1286,7 +1352,7 @@
         set_sched_topology(x86_topology);
 
         set_cpu_sibling_map(0);
-
+        init_freq_invariance(false);
         smp_sanity_check();
 
         switch (apic_intr_mode) {
@@ -1312,8 +1378,6 @@
         pr_info("CPU0: ");
         print_cpu_info(&cpu_data(0));
 
-        native_pv_lock_init();
-
         uv_system_init();
 
         set_mtrr_aps_delayed_init();
@@ -1323,12 +1387,12 @@
         speculative_store_bypass_ht_init();
 }
 
-void arch_enable_nonboot_cpus_begin(void)
+void arch_thaw_secondary_cpus_begin(void)
 {
         set_mtrr_aps_delayed_init();
 }
 
-void arch_enable_nonboot_cpus_end(void)
+void arch_thaw_secondary_cpus_end(void)
 {
         mtrr_aps_init();
 }
@@ -1343,6 +1407,7 @@
         /* already set me in cpu_online_mask in boot_cpu_init() */
         cpumask_set_cpu(me, cpu_callout_mask);
         cpu_set_state_online(me);
+        native_pv_lock_init();
 }
 
 void __init calculate_max_logical_packages(void)
@@ -1384,7 +1449,7 @@
         /*
          * cpu_possible_mask should be static, it cannot change as cpu's
          * are onlined, or offlined. The reason is per-cpu data-structures
-         * are allocated by some modules at init time, and dont expect to
+         * are allocated by some modules at init time, and don't expect to
          * do this dynamically on cpu arrival/departure.
          * cpu_present_mask on the other hand can change dynamically.
          * In case when cpu_hotplug is not compiled, then we resort to current
@@ -1493,6 +1558,8 @@
                         cpu_data(sibling).booted_cores--;
         }
 
+        for_each_cpu(sibling, topology_die_cpumask(cpu))
+                cpumask_clear_cpu(cpu, topology_die_cpumask(sibling));
         for_each_cpu(sibling, topology_sibling_cpumask(cpu))
                 cpumask_clear_cpu(cpu, topology_sibling_cpumask(sibling));
         for_each_cpu(sibling, cpu_llc_shared_mask(cpu))
@@ -1500,6 +1567,7 @@
         cpumask_clear(cpu_llc_shared_mask(cpu));
         cpumask_clear(topology_sibling_cpumask(cpu));
         cpumask_clear(topology_core_cpumask(cpu));
+        cpumask_clear(topology_die_cpumask(cpu));
         c->cpu_core_id = 0;
         c->booted_cores = 0;
         cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
@@ -1538,8 +1606,27 @@
         if (ret)
                 return ret;
 
-        clear_local_APIC();
         cpu_disable_common();
+
+        /*
+         * Disable the local APIC. Otherwise IPI broadcasts will reach
+         * it. It still responds normally to INIT, NMI, SMI, and SIPI
+         * messages.
+         *
+         * Disabling the APIC must happen after cpu_disable_common()
+         * which invokes fixup_irqs().
+         *
+         * Disabling the APIC preserves already set bits in IRR, but
+         * an interrupt arriving after disabling the local APIC does not
+         * set the corresponding IRR bit.
+         *
+         * fixup_irqs() scans IRR for set bits so it can raise a not
+         * yet handled interrupt on the new destination CPU via an IPI
+         * but obviously it can't do so for IRR bits which are not set.
+         * IOW, interrupts arriving after disabling the local APIC will
+         * be lost.
+         */
+        apic_soft_disable();
 
         return 0;
 }
@@ -1580,13 +1667,17 @@
         local_irq_disable();
 }
 
-static bool wakeup_cpu0(void)
+/**
+ * cond_wakeup_cpu0 - Wake up CPU0 if needed.
+ *
+ * If NMI wants to wake up CPU0, start CPU0.
+ */
+void cond_wakeup_cpu0(void)
 {
         if (smp_processor_id() == 0 && enable_start_cpu0)
-                return true;
-
-        return false;
+                start_cpu0();
 }
+EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
 
 /*
  * We need to flush the caches before going to sleep, lest we have
@@ -1594,13 +1685,14 @@
  */
 static inline void mwait_play_dead(void)
 {
+        struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
         unsigned int eax, ebx, ecx, edx;
         unsigned int highest_cstate = 0;
         unsigned int highest_subcstate = 0;
-        void *mwait_ptr;
         int i;
 
-        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+            boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
                 return;
         if (!this_cpu_has(X86_FEATURE_MWAIT))
                 return;
@@ -1631,13 +1723,6 @@
                         (highest_subcstate - 1);
         }
 
-        /*
-         * This should be a memory location in a cache line which is
-         * unlikely to be touched by other processors. The actual
-         * content is immaterial as it is not actually modified in any way.
-         */
-        mwait_ptr = &current_thread_info()->flags;
-
         wbinvd();
 
         while (1) {
@@ -1649,16 +1734,13 @@
                  * case where we return around the loop.
                  */
                 mb();
-                clflush(mwait_ptr);
+                clflush(md);
                 mb();
-                __monitor(mwait_ptr, 0, 0);
+                __monitor(md, 0, 0);
                 mb();
                 __mwait(eax, 0);
-                /*
-                 * If NMI wants to wake up CPU0, start CPU0.
-                 */
-                if (wakeup_cpu0())
-                        start_cpu0();
+
+                cond_wakeup_cpu0();
         }
 }
 
@@ -1669,11 +1751,8 @@
 
         while (1) {
                 native_halt();
-                /*
-                 * If NMI wants to wake up CPU0, start CPU0.
-                 */
-                if (wakeup_cpu0())
-                        start_cpu0();
+
+                cond_wakeup_cpu0();
         }
 }
 
@@ -1705,3 +1784,339 @@
 }
 
 #endif
+
+#ifdef CONFIG_X86_64
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ *            BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * An exception to the heuristic above is the Atom uarch, where we choose the
+ * highest turbo level for freq_max since Atom's are generally oriented towards
+ * power efficiency.
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+void arch_set_max_freq_ratio(bool turbo_disabled)
+{
+        arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
+                                        arch_turbo_freq_ratio;
+}
+EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
+
+static bool turbo_disabled(void)
+{
+        u64 misc_en;
+        int err;
+
+        err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+        if (err)
+                return false;
+
+        return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+        int err;
+
+        err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 16) & 0x3F; /* max P state */
+        *turbo_freq = *turbo_freq & 0x3F;       /* 1C turbo    */
+
+        return true;
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define X86_MATCH(model)                                        \
+        X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6,            \
+                INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+        X86_MATCH(XEON_PHI_KNL),
+        X86_MATCH(XEON_PHI_KNM),
+        {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+        X86_MATCH(SKYLAKE_X),
+        {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+        X86_MATCH(ATOM_GOLDMONT),
+        X86_MATCH(ATOM_GOLDMONT_D),
+        X86_MATCH(ATOM_GOLDMONT_PLUS),
+        {}
+};
+
+static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
+                                   int num_delta_fratio)
+{
+        int fratio, delta_fratio, found;
+        int err, i;
+        u64 msr;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+        if (err)
+                return false;
+
+        fratio = (msr >> 8) & 0xFF;
+        i = 16;
+        found = 0;
+        do {
+                if (found >= num_delta_fratio) {
+                        *turbo_freq = fratio;
+                        return true;
+                }
+
+                delta_fratio = (msr >> (i + 5)) & 0x7;
+
+                if (delta_fratio) {
+                        found += 1;
+                        fratio -= delta_fratio;
+                }
+
+                i += 8;
+        } while (i < 64);
+
+        return true;
+}
+
+static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
+{
+        u64 ratios, counts;
+        u32 group_size;
+        int err, i;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts);
+        if (err)
+                return false;
+
+        for (i = 0; i < 64; i += 8) {
+                group_size = (counts >> i) & 0xFF;
+                if (group_size >= size) {
+                        *turbo_freq = (ratios >> i) & 0xFF;
+                        return true;
+                }
+        }
+
+        return false;
+}
+
+static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
+{
+        u64 msr;
+        int err;
+
+        err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
+        if (err)
+                return false;
+
+        err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
+        if (err)
+                return false;
+
+        *base_freq = (*base_freq >> 8) & 0xFF;  /* max P state */
+        *turbo_freq = (msr >> 24) & 0xFF;       /* 4C turbo    */
+
+        /* The CPU may have less than 4 cores */
+        if (!*turbo_freq)
+                *turbo_freq = msr & 0xFF;       /* 1C turbo    */
+
+        return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+        u64 base_freq, turbo_freq;
+        u64 turbo_ratio;
+
+        if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
+                goto out;
+
+        if (x86_match_cpu(has_glm_turbo_ratio_limits) &&
+            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+                goto out;
+
+        if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
+            knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
+                goto out;
+
+        if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
+            skx_set_max_freq_ratio(&base_freq, &turbo_freq, 4))
+                goto out;
+
+        if (core_set_max_freq_ratio(&base_freq, &turbo_freq))
+                goto out;
+
+        return false;
+
+out:
+        /*
+         * Some hypervisors advertise X86_FEATURE_APERFMPERF
+         * but then fill all MSR's with zeroes.
+         * Some CPUs have turbo boost but don't declare any turbo ratio
+         * in MSR_TURBO_RATIO_LIMIT.
+         */
+        if (!base_freq || !turbo_freq) {
+                pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
+                return false;
+        }
+
+        turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+        if (!turbo_ratio) {
+                pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+                return false;
+        }
+
+        arch_turbo_freq_ratio = turbo_ratio;
+        arch_set_max_freq_ratio(turbo_disabled());
+
+        return true;
+}
+
+static void init_counter_refs(void)
+{
+        u64 aperf, mperf;
+
+        rdmsrl(MSR_IA32_APERF, aperf);
+        rdmsrl(MSR_IA32_MPERF, mperf);
+
+        this_cpu_write(arch_prev_aperf, aperf);
+        this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(bool secondary)
+{
+        bool ret = false;
+
+        if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+                return;
+
+        if (secondary) {
+                if (static_branch_likely(&arch_scale_freq_key)) {
+                        init_counter_refs();
+                }
+                return;
+        }
+
+        if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+                ret = intel_set_max_freq_ratio();
+
+        if (ret) {
+                init_counter_refs();
+                static_branch_enable(&arch_scale_freq_key);
+        } else {
+                pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+        }
+}
+
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+        static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+                    disable_freq_invariance_workfn);
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+        u64 freq_scale = SCHED_CAPACITY_SCALE;
+        u64 aperf, mperf;
+        u64 acnt, mcnt;
+
+        if (!arch_scale_freq_invariant())
+                return;
+
+        rdmsrl(MSR_IA32_APERF, aperf);
+        rdmsrl(MSR_IA32_MPERF, mperf);
+
+        acnt = aperf - this_cpu_read(arch_prev_aperf);
+        mcnt = mperf - this_cpu_read(arch_prev_mperf);
+
+        this_cpu_write(arch_prev_aperf, aperf);
+        this_cpu_write(arch_prev_mperf, mperf);
+
+        if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+                goto error;
+
+        if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+                goto error;
+
+        freq_scale = div64_u64(acnt, mcnt);
+        if (!freq_scale)
+                goto error;
+
+        if (freq_scale > SCHED_CAPACITY_SCALE)
+                freq_scale = SCHED_CAPACITY_SCALE;
+
+        this_cpu_write(arch_freq_scale, freq_scale);
+        return;
+
+error:
+        pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+        schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{
+}
+#endif /* CONFIG_X86_64 */