~ljy/RK3588_XEN.git

2024-11-01 2f529f9b558ca1c1bd74be7437a84e4711743404

commit \| author \| age
a07526	1	// SPDX-License-Identifier: GPL-2.0-or-later
H	2	/*
	3	* KVM paravirt_ops implementation
	4	*
	5	* Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
	6	* Copyright IBM Corporation, 2007
	7	* Authors: Anthony Liguori <aliguori@us.ibm.com>
	8	*/
	9
	10	#define pr_fmt(fmt) "kvm-guest: " fmt
	11
	12	#include <linux/context_tracking.h>
	13	#include <linux/init.h>
	14	#include <linux/irq.h>
	15	#include <linux/kernel.h>
	16	#include <linux/kvm_para.h>
	17	#include <linux/cpu.h>
	18	#include <linux/mm.h>
	19	#include <linux/highmem.h>
	20	#include <linux/hardirq.h>
	21	#include <linux/notifier.h>
	22	#include <linux/reboot.h>
	23	#include <linux/hash.h>
	24	#include <linux/sched.h>
	25	#include <linux/slab.h>
	26	#include <linux/kprobes.h>
	27	#include <linux/nmi.h>
	28	#include <linux/swait.h>
	29	#include <linux/syscore_ops.h>
	30	#include <asm/timer.h>
	31	#include <asm/cpu.h>
	32	#include <asm/traps.h>
	33	#include <asm/desc.h>
	34	#include <asm/tlbflush.h>
	35	#include <asm/apic.h>
	36	#include <asm/apicdef.h>
	37	#include <asm/hypervisor.h>
	38	#include <asm/tlb.h>
	39	#include <asm/cpuidle_haltpoll.h>
	40	#include <asm/ptrace.h>
	41	#include <asm/reboot.h>
	42	#include <asm/svm.h>
	43
	44	DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
	45
	46	static int kvmapf = 1;
	47
	48	static int __init parse_no_kvmapf(char *arg)
	49	{
	50	kvmapf = 0;
	51	return 0;
	52	}
	53
	54	early_param("no-kvmapf", parse_no_kvmapf);
	55
	56	static int steal_acc = 1;
	57	static int __init parse_no_stealacc(char *arg)
	58	{
	59	steal_acc = 0;
	60	return 0;
	61	}
	62
	63	early_param("no-steal-acc", parse_no_stealacc);
	64
	65	static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
	66	DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
	67	static int has_steal_clock = 0;
	68
	69	static int has_guest_poll = 0;
	70	/*
	71	* No need for any "IO delay" on KVM
	72	*/
	73	static void kvm_io_delay(void)
	74	{
	75	}
	76
	77	#define KVM_TASK_SLEEP_HASHBITS 8
	78	#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
	79
	80	struct kvm_task_sleep_node {
	81	struct hlist_node link;
	82	struct swait_queue_head wq;
	83	u32 token;
	84	int cpu;
	85	};
	86
	87	static struct kvm_task_sleep_head {
	88	raw_spinlock_t lock;
	89	struct hlist_head list;
	90	} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
	91
	92	static struct kvm_task_sleep_node _find_apf_task(struct kvm_task_sleep_head b,
	93	u32 token)
	94	{
	95	struct hlist_node *p;
	96
	97	hlist_for_each(p, &b->list) {
	98	struct kvm_task_sleep_node *n =
	99	hlist_entry(p, typeof(*n), link);
	100	if (n->token == token)
	101	return n;
	102	}
	103
	104	return NULL;
	105	}
	106
	107	static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
	108	{
	109	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	110	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	111	struct kvm_task_sleep_node *e;
	112
	113	raw_spin_lock(&b->lock);
	114	e = _find_apf_task(b, token);
	115	if (e) {
	116	/* dummy entry exist -> wake up was delivered ahead of PF */
	117	hlist_del(&e->link);
	118	raw_spin_unlock(&b->lock);
	119	kfree(e);
	120	return false;
	121	}
	122
	123	n->token = token;
	124	n->cpu = smp_processor_id();
	125	init_swait_queue_head(&n->wq);
	126	hlist_add_head(&n->link, &b->list);
	127	raw_spin_unlock(&b->lock);
	128	return true;
	129	}
	130
	131	/*
	132	* kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
	133	* @token: Token to identify the sleep node entry
	134	*
	135	* Invoked from the async pagefault handling code or from the VM exit page
	136	* fault handler. In both cases RCU is watching.
	137	*/
	138	void kvm_async_pf_task_wait_schedule(u32 token)
	139	{
	140	struct kvm_task_sleep_node n;
	141	DECLARE_SWAITQUEUE(wait);
	142
	143	lockdep_assert_irqs_disabled();
	144
	145	if (!kvm_async_pf_queue_task(token, &n))
	146	return;
	147
	148	for (;;) {
	149	prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
	150	if (hlist_unhashed(&n.link))
	151	break;
	152
	153	local_irq_enable();
	154	schedule();
	155	local_irq_disable();
	156	}
	157	finish_swait(&n.wq, &wait);
	158	}
	159	EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
	160
	161	static void apf_task_wake_one(struct kvm_task_sleep_node *n)
	162	{
	163	hlist_del_init(&n->link);
	164	if (swq_has_sleeper(&n->wq))
	165	swake_up_one(&n->wq);
	166	}
	167
	168	static void apf_task_wake_all(void)
	169	{
	170	int i;
	171
	172	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
	173	struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
	174	struct kvm_task_sleep_node *n;
	175	struct hlist_node p, next;
	176
	177	raw_spin_lock(&b->lock);
	178	hlist_for_each_safe(p, next, &b->list) {
	179	n = hlist_entry(p, typeof(*n), link);
	180	if (n->cpu == smp_processor_id())
	181	apf_task_wake_one(n);
	182	}
	183	raw_spin_unlock(&b->lock);
	184	}
	185	}
	186
	187	void kvm_async_pf_task_wake(u32 token)
	188	{
	189	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	190	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	191	struct kvm_task_sleep_node n, dummy = NULL;
	192
	193	if (token == ~0) {
	194	apf_task_wake_all();
	195	return;
	196	}
	197
	198	again:
	199	raw_spin_lock(&b->lock);
	200	n = _find_apf_task(b, token);
	201	if (!n) {
	202	/*
	203	* Async #PF not yet handled, add a dummy entry for the token.
	204	* Allocating the token must be down outside of the raw lock
	205	* as the allocator is preemptible on PREEMPT_RT kernels.
	206	*/
	207	if (!dummy) {
	208	raw_spin_unlock(&b->lock);
	209	dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC);
	210
	211	/*
	212	* Continue looping on allocation failure, eventually
	213	* the async #PF will be handled and allocating a new
	214	* node will be unnecessary.
	215	*/
	216	if (!dummy)
	217	cpu_relax();
	218
	219	/*
	220	* Recheck for async #PF completion before enqueueing
	221	* the dummy token to avoid duplicate list entries.
	222	*/
	223	goto again;
	224	}
	225	dummy->token = token;
	226	dummy->cpu = smp_processor_id();
	227	init_swait_queue_head(&dummy->wq);
	228	hlist_add_head(&dummy->link, &b->list);
	229	dummy = NULL;
	230	} else {
	231	apf_task_wake_one(n);
	232	}
	233	raw_spin_unlock(&b->lock);
	234
	235	/* A dummy token might be allocated and ultimately not used. */
	236	if (dummy)
	237	kfree(dummy);
	238	}
	239	EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
	240
	241	noinstr u32 kvm_read_and_reset_apf_flags(void)
	242	{
	243	u32 flags = 0;
	244
	245	if (__this_cpu_read(apf_reason.enabled)) {
	246	flags = __this_cpu_read(apf_reason.flags);
	247	__this_cpu_write(apf_reason.flags, 0);
	248	}
	249
	250	return flags;
	251	}
	252	EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
	253
	254	noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
	255	{
	256	u32 flags = kvm_read_and_reset_apf_flags();
	257	irqentry_state_t state;
2f529f	258	unsigned long irqflags;
a07526	259
H	260	if (!flags)
	261	return false;
	262
	263	state = irqentry_enter(regs);
2f529f	264	oob_trap_notify(X86_TRAP_PF, regs);
a07526	265	instrumentation_begin();
2f529f	266	irqflags = hard_cond_local_irq_save();
a07526	267
H	268	/*
	269	* If the host managed to inject an async #PF into an interrupt
	270	* disabled region, then die hard as this is not going to end well
	271	* and the host side is seriously broken.
	272	*/
	273	if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
	274	panic("Host injected async #PF in interrupt disabled region\n");
	275
	276	if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
	277	if (unlikely(!(user_mode(regs))))
	278	panic("Host injected async #PF in kernel mode\n");
	279	/* Page is swapped out by the host. */
	280	kvm_async_pf_task_wait_schedule(token);
	281	} else {
	282	WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags);
	283	}
	284
2f529f	285	hard_cond_local_irq_restore(irqflags);
a07526	286	instrumentation_end();
2f529f	287	oob_trap_unwind(X86_TRAP_PF, regs);
a07526	288	irqentry_exit(regs, state);
H	289	return true;
	290	}
	291
	292	DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt)
	293	{
	294	struct pt_regs *old_regs = set_irq_regs(regs);
	295	u32 token;
	296
	297	ack_APIC_irq();
	298
	299	inc_irq_stat(irq_hv_callback_count);
	300
	301	if (__this_cpu_read(apf_reason.enabled)) {
	302	token = __this_cpu_read(apf_reason.token);
	303	kvm_async_pf_task_wake(token);
	304	__this_cpu_write(apf_reason.token, 0);
	305	wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1);
	306	}
	307
	308	set_irq_regs(old_regs);
	309	}
	310
	311	static void __init paravirt_ops_setup(void)
	312	{
	313	pv_info.name = "KVM";
	314
	315	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
	316	pv_ops.cpu.io_delay = kvm_io_delay;
	317
	318	#ifdef CONFIG_X86_IO_APIC
	319	no_timer_check = 1;
	320	#endif
	321	}
	322
	323	static void kvm_register_steal_time(void)
	324	{
	325	int cpu = smp_processor_id();
	326	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
	327
	328	if (!has_steal_clock)
	329	return;
	330
	331	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) \| KVM_MSR_ENABLED));
	332	pr_info("stealtime: cpu %d, msr %llx\n", cpu,
	333	(unsigned long long) slow_virt_to_phys(st));
	334	}
	335
	336	static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
	337
	338	static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
	339	{
	340	/**
	341	* This relies on __test_and_clear_bit to modify the memory
	342	* in a way that is atomic with respect to the local CPU.
	343	* The hypervisor only accesses this memory from the local CPU so
	344	* there's no need for lock or memory barriers.
	345	* An optimization barrier is implied in apic write.
	346	*/
	347	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
	348	return;
	349	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
	350	}
	351
	352	static void kvm_guest_cpu_init(void)
	353	{
	354	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
	355	u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
	356
	357	WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
	358
	359	pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
	360	pa \|= KVM_ASYNC_PF_ENABLED \| KVM_ASYNC_PF_DELIVERY_AS_INT;
	361
	362	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
	363	pa \|= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
	364
	365	wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR);
	366
	367	wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
	368	__this_cpu_write(apf_reason.enabled, 1);
	369	pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
	370	}
	371
	372	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
	373	unsigned long pa;
	374
	375	/* Size alignment is implied but just to make it explicit. */
	376	BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
	377	__this_cpu_write(kvm_apic_eoi, 0);
	378	pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
	379	\| KVM_MSR_ENABLED;
	380	wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	381	}
	382
	383	if (has_steal_clock)
	384	kvm_register_steal_time();
	385	}
	386
	387	static void kvm_pv_disable_apf(void)
	388	{
	389	if (!__this_cpu_read(apf_reason.enabled))
	390	return;
	391
	392	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	393	__this_cpu_write(apf_reason.enabled, 0);
	394
	395	pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
	396	}
	397
	398	static void kvm_disable_steal_time(void)
	399	{
	400	if (!has_steal_clock)
	401	return;
	402
	403	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
	404	}
	405
	406	static void kvm_pv_guest_cpu_reboot(void *unused)
	407	{
	408	/*
	409	* We disable PV EOI before we load a new kernel by kexec,
	410	* since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
	411	* New kernel can re-enable when it boots.
	412	*/
	413	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
	414	wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	415	kvm_pv_disable_apf();
	416	kvm_disable_steal_time();
	417	}
	418
	419	static int kvm_pv_reboot_notify(struct notifier_block *nb,
	420	unsigned long code, void *unused)
	421	{
	422	if (code == SYS_RESTART)
	423	on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	424	return NOTIFY_DONE;
	425	}
	426
	427	static struct notifier_block kvm_pv_reboot_nb = {
	428	.notifier_call = kvm_pv_reboot_notify,
	429	};
	430
	431	static u64 kvm_steal_clock(int cpu)
	432	{
	433	u64 steal;
	434	struct kvm_steal_time *src;
	435	int version;
	436
	437	src = &per_cpu(steal_time, cpu);
	438	do {
	439	version = src->version;
	440	virt_rmb();
	441	steal = src->steal;
	442	virt_rmb();
	443	} while ((version & 1) \|\| (version != src->version));
	444
	445	return steal;
	446	}
	447
	448	static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
	449	{
	450	early_set_memory_decrypted((unsigned long) ptr, size);
	451	}
	452
	453	/*
	454	* Iterate through all possible CPUs and map the memory region pointed
	455	* by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
	456	*
	457	* Note: we iterate through all possible CPUs to ensure that CPUs
	458	* hotplugged will have their per-cpu variable already mapped as
	459	* decrypted.
	460	*/
	461	static void __init sev_map_percpu_data(void)
	462	{
	463	int cpu;
	464
	465	if (!sev_active())
	466	return;
	467
	468	for_each_possible_cpu(cpu) {
	469	__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
	470	__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
	471	__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
	472	}
	473	}
	474
	475	static bool pv_tlb_flush_supported(void)
	476	{
	477	return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	478	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	479	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
	480	}
	481
	482	static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
	483
	484	static void kvm_guest_cpu_offline(bool shutdown)
	485	{
2f529f	486	unsigned long flags;
H	487
	488	flags = hard_local_irq_save();
a07526	489	kvm_disable_steal_time();
H	490	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
	491	wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	492	kvm_pv_disable_apf();
	493	if (!shutdown)
	494	apf_task_wake_all();
	495	kvmclock_disable();
2f529f	496	hard_local_irq_restore(flags);
a07526	497	}
H	498
	499	static int kvm_cpu_online(unsigned int cpu)
	500	{
	501	unsigned long flags;
	502
2f529f	503	local_irq_save_full(flags);
a07526	504	kvm_guest_cpu_init();
2f529f	505	local_irq_restore_full(flags);
a07526	506	return 0;
H	507	}
	508
	509	#ifdef CONFIG_SMP
	510
	511	static bool pv_ipi_supported(void)
	512	{
	513	return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
	514	}
	515
	516	static bool pv_sched_yield_supported(void)
	517	{
	518	return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
	519	!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	520	kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
	521	}
	522
	523	#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
	524
	525	static void __send_ipi_mask(const struct cpumask *mask, int vector)
	526	{
	527	unsigned long flags;
	528	int cpu, apic_id, icr;
	529	int min = 0, max = 0;
	530	#ifdef CONFIG_X86_64
	531	__uint128_t ipi_bitmap = 0;
	532	#else
	533	u64 ipi_bitmap = 0;
	534	#endif
	535	long ret;
	536
	537	if (cpumask_empty(mask))
	538	return;
	539
	540	local_irq_save(flags);
	541
	542	switch (vector) {
	543	default:
	544	icr = APIC_DM_FIXED \| vector;
	545	break;
	546	case NMI_VECTOR:
	547	icr = APIC_DM_NMI;
	548	break;
	549	}
	550
	551	for_each_cpu(cpu, mask) {
	552	apic_id = per_cpu(x86_cpu_to_apicid, cpu);
	553	if (!ipi_bitmap) {
	554	min = max = apic_id;
	555	} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
	556	ipi_bitmap <<= min - apic_id;
	557	min = apic_id;
	558	} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
	559	max = apic_id < max ? max : apic_id;
	560	} else {
	561	ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
	562	(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
	563	WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
	564	ret);
	565	min = max = apic_id;
	566	ipi_bitmap = 0;
	567	}
	568	__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	569	}
	570
	571	if (ipi_bitmap) {
	572	ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
	573	(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
	574	WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld",
	575	ret);
	576	}
	577
	578	local_irq_restore(flags);
	579	}
	580
	581	static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
	582	{
	583	__send_ipi_mask(mask, vector);
	584	}
	585
	586	static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
	587	{
	588	unsigned int this_cpu = smp_processor_id();
	589	struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
	590	const struct cpumask *local_mask;
	591
	592	cpumask_copy(new_mask, mask);
	593	cpumask_clear_cpu(this_cpu, new_mask);
	594	local_mask = new_mask;
	595	__send_ipi_mask(local_mask, vector);
	596	}
	597
	598	/*
	599	* Set the IPI entry points
	600	*/
	601	static void kvm_setup_pv_ipi(void)
	602	{
	603	apic->send_IPI_mask = kvm_send_ipi_mask;
	604	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
	605	pr_info("setup PV IPIs\n");
	606	}
	607
	608	static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
	609	{
	610	int cpu;
	611
	612	native_send_call_func_ipi(mask);
	613
	614	/* Make sure other vCPUs get a chance to run if they need to. */
	615	for_each_cpu(cpu, mask) {
	616	if (vcpu_is_preempted(cpu)) {
	617	kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
	618	break;
	619	}
	620	}
	621	}
	622
	623	static void __init kvm_smp_prepare_boot_cpu(void)
	624	{
	625	/*
	626	* Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
	627	* shares the guest physical address with the hypervisor.
	628	*/
	629	sev_map_percpu_data();
	630
	631	kvm_guest_cpu_init();
	632	native_smp_prepare_boot_cpu();
	633	kvm_spinlock_init();
	634	}
	635
	636	static int kvm_cpu_down_prepare(unsigned int cpu)
	637	{
	638	unsigned long flags;
	639
	640	local_irq_save(flags);
	641	kvm_guest_cpu_offline(false);
	642	local_irq_restore(flags);
	643	return 0;
	644	}
	645
	646	#endif
	647
	648	static int kvm_suspend(void)
	649	{
	650	u64 val = 0;
	651
	652	kvm_guest_cpu_offline(false);
	653
	654	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	655	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
	656	rdmsrl(MSR_KVM_POLL_CONTROL, val);
	657	has_guest_poll = !(val & 1);
	658	#endif
	659	return 0;
	660	}
	661
	662	static void kvm_resume(void)
	663	{
	664	kvm_cpu_online(raw_smp_processor_id());
	665
	666	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	667	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll)
	668	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
	669	#endif
	670	}
	671
	672	static struct syscore_ops kvm_syscore_ops = {
	673	.suspend = kvm_suspend,
	674	.resume = kvm_resume,
	675	};
	676
	677	/*
	678	* After a PV feature is registered, the host will keep writing to the
	679	* registered memory location. If the guest happens to shutdown, this memory
	680	* won't be valid. In cases like kexec, in which you install a new kernel, this
	681	* means a random memory location will be kept being written.
	682	*/
	683	#ifdef CONFIG_KEXEC_CORE
	684	static void kvm_crash_shutdown(struct pt_regs *regs)
	685	{
	686	kvm_guest_cpu_offline(true);
	687	native_machine_crash_shutdown(regs);
	688	}
	689	#endif
	690
	691	static void kvm_flush_tlb_others(const struct cpumask *cpumask,
	692	const struct flush_tlb_info *info)
	693	{
	694	u8 state;
	695	int cpu;
	696	struct kvm_steal_time *src;
	697	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
	698
	699	cpumask_copy(flushmask, cpumask);
	700	/*
	701	* We have to call flush only on online vCPUs. And
	702	* queue flush_on_enter for pre-empted vCPUs
	703	*/
	704	for_each_cpu(cpu, flushmask) {
	705	src = &per_cpu(steal_time, cpu);
	706	state = READ_ONCE(src->preempted);
	707	if ((state & KVM_VCPU_PREEMPTED)) {
	708	if (try_cmpxchg(&src->preempted, &state,
	709	state \| KVM_VCPU_FLUSH_TLB))
	710	__cpumask_clear_cpu(cpu, flushmask);
	711	}
	712	}
	713
	714	native_flush_tlb_others(flushmask, info);
	715	}
	716
	717	static void __init kvm_guest_init(void)
	718	{
	719	int i;
	720
	721	paravirt_ops_setup();
	722	register_reboot_notifier(&kvm_pv_reboot_nb);
	723	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
	724	raw_spin_lock_init(&async_pf_sleepers[i].lock);
	725
	726	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
	727	has_steal_clock = 1;
	728	pv_ops.time.steal_clock = kvm_steal_clock;
	729	}
	730
	731	if (pv_tlb_flush_supported()) {
	732	pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
	733	pv_ops.mmu.tlb_remove_table = tlb_remove_table;
	734	pr_info("KVM setup pv remote TLB flush\n");
	735	}
	736
	737	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
	738	apic_set_eoi_write(kvm_guest_apic_eoi_write);
	739
	740	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) {
	741	static_branch_enable(&kvm_async_pf_enabled);
	742	alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt);
	743	}
	744
	745	#ifdef CONFIG_SMP
	746	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	747	if (pv_sched_yield_supported()) {
	748	smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
	749	pr_info("setup PV sched yield\n");
	750	}
	751	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
	752	kvm_cpu_online, kvm_cpu_down_prepare) < 0)
	753	pr_err("failed to install cpu hotplug callbacks\n");
	754	#else
	755	sev_map_percpu_data();
	756	kvm_guest_cpu_init();
	757	#endif
	758
	759	#ifdef CONFIG_KEXEC_CORE
	760	machine_ops.crash_shutdown = kvm_crash_shutdown;
	761	#endif
	762
	763	register_syscore_ops(&kvm_syscore_ops);
	764
	765	/*
	766	* Hard lockup detection is enabled by default. Disable it, as guests
	767	* can get false positives too easily, for example if the host is
	768	* overcommitted.
	769	*/
	770	hardlockup_detector_disable();
	771	}
	772
	773	static noinline uint32_t __kvm_cpuid_base(void)
	774	{
	775	if (boot_cpu_data.cpuid_level < 0)
	776	return 0; /* So we don't blow up on old processors */
	777
	778	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
	779	return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);
	780
	781	return 0;
	782	}
	783
	784	static inline uint32_t kvm_cpuid_base(void)
	785	{
	786	static int kvm_cpuid_base = -1;
	787
	788	if (kvm_cpuid_base == -1)
	789	kvm_cpuid_base = __kvm_cpuid_base();
	790
	791	return kvm_cpuid_base;
	792	}
	793
	794	bool kvm_para_available(void)
	795	{
	796	return kvm_cpuid_base() != 0;
	797	}
	798	EXPORT_SYMBOL_GPL(kvm_para_available);
	799
	800	unsigned int kvm_arch_para_features(void)
	801	{
	802	return cpuid_eax(kvm_cpuid_base() \| KVM_CPUID_FEATURES);
	803	}
	804
	805	unsigned int kvm_arch_para_hints(void)
	806	{
	807	return cpuid_edx(kvm_cpuid_base() \| KVM_CPUID_FEATURES);
	808	}
	809	EXPORT_SYMBOL_GPL(kvm_arch_para_hints);
	810
	811	static uint32_t __init kvm_detect(void)
	812	{
	813	return kvm_cpuid_base();
	814	}
	815
	816	static void __init kvm_apic_init(void)
	817	{
	818	#if defined(CONFIG_SMP)
	819	if (pv_ipi_supported())
	820	kvm_setup_pv_ipi();
	821	#endif
	822	}
	823
	824	static void __init kvm_init_platform(void)
	825	{
	826	kvmclock_init();
	827	x86_platform.apic_post_init = kvm_apic_init;
	828	}
	829
	830	#if defined(CONFIG_AMD_MEM_ENCRYPT)
	831	static void kvm_sev_es_hcall_prepare(struct ghcb ghcb, struct pt_regs regs)
	832	{
	833	/* RAX and CPL are already in the GHCB */
	834	ghcb_set_rbx(ghcb, regs->bx);
	835	ghcb_set_rcx(ghcb, regs->cx);
	836	ghcb_set_rdx(ghcb, regs->dx);
	837	ghcb_set_rsi(ghcb, regs->si);
	838	}
	839
	840	static bool kvm_sev_es_hcall_finish(struct ghcb ghcb, struct pt_regs regs)
	841	{
	842	/* No checking of the return state needed */
	843	return true;
	844	}
	845	#endif
	846
	847	const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	848	.name = "KVM",
	849	.detect = kvm_detect,
	850	.type = X86_HYPER_KVM,
	851	.init.guest_late_init = kvm_guest_init,
	852	.init.x2apic_available = kvm_para_available,
	853	.init.init_platform = kvm_init_platform,
	854	#if defined(CONFIG_AMD_MEM_ENCRYPT)
	855	.runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare,
	856	.runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish,
	857	#endif
	858	};
	859
	860	static __init int activate_jump_labels(void)
	861	{
	862	if (has_steal_clock) {
	863	static_key_slow_inc(&paravirt_steal_enabled);
	864	if (steal_acc)
	865	static_key_slow_inc(&paravirt_steal_rq_enabled);
	866	}
	867
	868	return 0;
	869	}
	870	arch_initcall(activate_jump_labels);
	871
	872	static __init int kvm_alloc_cpumask(void)
	873	{
	874	int cpu;
	875	bool alloc = false;
	876
	877	if (!kvm_para_available() \|\| nopv)
	878	return 0;
	879
	880	if (pv_tlb_flush_supported())
	881	alloc = true;
	882
	883	#if defined(CONFIG_SMP)
	884	if (pv_ipi_supported())
	885	alloc = true;
	886	#endif
	887
	888	if (alloc)
	889	for_each_possible_cpu(cpu) {
	890	zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
	891	GFP_KERNEL, cpu_to_node(cpu));
	892	}
	893
	894	return 0;
	895	}
	896	arch_initcall(kvm_alloc_cpumask);
	897
	898	#ifdef CONFIG_PARAVIRT_SPINLOCKS
	899
	900	/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
	901	static void kvm_kick_cpu(int cpu)
	902	{
	903	int apicid;
	904	unsigned long flags = 0;
	905
	906	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	907	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
	908	}
	909
	910	#include <asm/qspinlock.h>
	911
	912	static void kvm_wait(u8 *ptr, u8 val)
	913	{
	914	unsigned long flags;
	915
	916	if (in_nmi())
	917	return;
	918
2f529f	919	flags = hard_local_irq_save();
a07526	920
H	921	if (READ_ONCE(*ptr) != val)
	922	goto out;
	923
	924	/*
	925	* halt until it's our turn and kicked. Note that we do safe halt
	926	* for irq enabled case to avoid hang when lock info is overwritten
	927	* in irq spinlock slowpath and no spurious interrupt occur to save us.
	928	*/
	929	if (arch_irqs_disabled_flags(flags))
	930	halt();
	931	else
	932	safe_halt();
	933
	934	out:
2f529f	935	hard_local_irq_restore(flags);
a07526	936	}
H	937
	938	#ifdef CONFIG_X86_32
	939	__visible bool __kvm_vcpu_is_preempted(long cpu)
	940	{
	941	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
	942
	943	return !!(src->preempted & KVM_VCPU_PREEMPTED);
	944	}
	945	PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
	946
	947	#else
	948
	949	#include <asm/asm-offsets.h>
	950
	951	extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
	952
	953	/*
	954	* Hand-optimized version for x86-64 to avoid saving and restoring eight
	955	* 64-bit registers to/from the stack.
		*
		* The stub loads the __per_cpu_offset[] entry indexed by %rdi
		* (presumably the cpu argument -- confirm against the calling
		* convention), compares the byte at offset KVM_STEAL_TIME_preempted
		* of that CPU's steal_time with zero and returns the result of the
		* "not equal" test in %al.
	956	*/
	957	asm(
	958	".pushsection .text;"
	959	".global __raw_callee_save___kvm_vcpu_is_preempted;"
	960	".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
	961	"__raw_callee_save___kvm_vcpu_is_preempted:"
	962	"movq __per_cpu_offset(,%rdi,8), %rax;"
	963	"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
	964	"setne %al;"
	965	ASM_RET
	966	".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
	967	".popsection");
	968
	969	#endif
	970
	971	/*
	972	* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
	973	*/
	974	void __init kvm_spinlock_init(void)
	975	{
	976	/*
	977	* In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an
	978	* advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is
	979	* preferred over native qspinlock when vCPU is preempted.
	980	*/
	981	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) {
	982	pr_info("PV spinlocks disabled, no host support\n");
	983	return;
	984	}
	985
	986	/*
	987	* Disable PV spinlocks and use native qspinlock when dedicated pCPUs
	988	* are available.
	989	*/
	990	if (kvm_para_has_hint(KVM_HINTS_REALTIME)) {
	991	pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n");
	992	goto out;
	993	}
	994
	995	if (num_possible_cpus() == 1) {
	996	pr_info("PV spinlocks disabled, single CPU\n");
	997	goto out;
	998	}
	999
	1000	if (nopvspin) {
	1001	pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n");
	1002	goto out;
	1003	}
	1004
	1005	pr_info("PV spinlocks enabled\n");
	1006
	1007	__pv_init_lock_hash();
	1008	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	1009	pv_ops.lock.queued_spin_unlock =
	1010	PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	1011	pv_ops.lock.wait = kvm_wait;
	1012	pv_ops.lock.kick = kvm_kick_cpu;
	1013
	1014	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
	1015	pv_ops.lock.vcpu_is_preempted =
	1016	PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
	1017	}
	1018	/*
	1019	* When PV spinlock is enabled which is preferred over
	1020	* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
	1021	* Just disable it anyway.
	1022	*/
	1023	out:
	1024	static_branch_disable(&virt_spin_lock_key);
	1025	}
	1026
	1027	#endif /* CONFIG_PARAVIRT_SPINLOCKS */
	1028
	1029	#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL
	1030
	1031	static void kvm_disable_host_haltpoll(void *i)
	1032	{
	1033	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
	1034	}
	1035
	1036	static void kvm_enable_host_haltpoll(void *i)
	1037	{
	1038	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
	1039	}
	1040
	1041	void arch_haltpoll_enable(unsigned int cpu)
	1042	{
	1043	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
	1044	pr_err_once("host does not support poll control\n");
	1045	pr_err_once("host upgrade recommended\n");
	1046	return;
	1047	}
	1048
	1049	/* Enable guest halt poll disables host halt poll */
	1050	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
	1051	}
	1052	EXPORT_SYMBOL_GPL(arch_haltpoll_enable);
	1053
	1054	void arch_haltpoll_disable(unsigned int cpu)
	1055	{
	1056	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
	1057	return;
	1058
	1059	/* Disable guest halt poll enables host halt poll */
	1060	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
	1061	}
	1062	EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
	1063	#endif