forked from ~ljy/RK356X_SDK_RELEASE

hc | 2023-12-08 | commit 01573e231f18eb2d99162747186f59511f56b64d
kernel/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -13,22 +14,21 @@
  * Yaniv Kamay <yaniv@qumranet.com>
  * Amit Shah <amit.shah@qumranet.com>
  * Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */
 
 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
 #include "hyperv.h"
+#include "lapic.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -54,7 +54,9 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
 
 #include <trace/events/kvm.h>
 
@@ -69,6 +71,10 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -79,7 +85,7 @@
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
-	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+	((struct kvm_vcpu *)(ctxt)->vcpu)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 				KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -108,7 +111,7 @@
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 static bool __read_mostly ignore_msrs = 0;
@@ -138,10 +141,14 @@
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -153,85 +160,147 @@
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);
 
-#define KVM_NR_SHARED_MSRS 16
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
-struct kvm_shared_msrs_global {
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs_global {
 	int nr;
-	u32 msrs[KVM_NR_SHARED_MSRS];
+	u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
 	struct user_return_notifier urn;
 	bool registered;
-	struct kvm_shared_msr_values {
+	struct kvm_user_return_msr_values {
 		u64 host;
 		u64 curr;
-	} values[KVM_NR_SHARED_MSRS];
+	} values[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+				| XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "pf_fixed", VCPU_STAT(pf_fixed) },
-	{ "pf_guest", VCPU_STAT(pf_guest) },
-	{ "tlb_flush", VCPU_STAT(tlb_flush) },
-	{ "invlpg", VCPU_STAT(invlpg) },
-	{ "exits", VCPU_STAT(exits) },
-	{ "io_exits", VCPU_STAT(io_exits) },
-	{ "mmio_exits", VCPU_STAT(mmio_exits) },
-	{ "signal_exits", VCPU_STAT(signal_exits) },
-	{ "irq_window", VCPU_STAT(irq_window_exits) },
-	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
-	{ "halt_exits", VCPU_STAT(halt_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "irq_exits", VCPU_STAT(irq_exits) },
-	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "fpu_reload", VCPU_STAT(fpu_reload) },
-	{ "insn_emulation", VCPU_STAT(insn_emulation) },
-	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-	{ "irq_injections", VCPU_STAT(irq_injections) },
-	{ "nmi_injections", VCPU_STAT(nmi_injections) },
-	{ "req_event", VCPU_STAT(req_event) },
-	{ "l1d_flush", VCPU_STAT(l1d_flush) },
-	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-	{ "mmu_flooded", VM_STAT(mmu_flooded) },
-	{ "mmu_recycled", VM_STAT(mmu_recycled) },
-	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages, .mode = 0444) },
-	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-	{ "max_mmu_page_hash_collisions",
-		VM_STAT(max_mmu_page_hash_collisions) },
+	VCPU_STAT("pf_fixed", pf_fixed),
+	VCPU_STAT("pf_guest", pf_guest),
+	VCPU_STAT("tlb_flush", tlb_flush),
+	VCPU_STAT("invlpg", invlpg),
+	VCPU_STAT("exits", exits),
+	VCPU_STAT("io_exits", io_exits),
+	VCPU_STAT("mmio_exits", mmio_exits),
+	VCPU_STAT("signal_exits", signal_exits),
+	VCPU_STAT("irq_window", irq_window_exits),
+	VCPU_STAT("nmi_window", nmi_window_exits),
+	VCPU_STAT("halt_exits", halt_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hypercalls", hypercalls),
+	VCPU_STAT("request_irq", request_irq_exits),
+	VCPU_STAT("irq_exits", irq_exits),
+	VCPU_STAT("host_state_reload", host_state_reload),
+	VCPU_STAT("fpu_reload", fpu_reload),
+	VCPU_STAT("insn_emulation", insn_emulation),
+	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+	VCPU_STAT("irq_injections", irq_injections),
+	VCPU_STAT("nmi_injections", nmi_injections),
+	VCPU_STAT("req_event", req_event),
+	VCPU_STAT("l1d_flush", l1d_flush),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+	VM_STAT("mmu_pte_write", mmu_pte_write),
+	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+	VM_STAT("mmu_flooded", mmu_flooded),
+	VM_STAT("mmu_recycled", mmu_recycled),
+	VM_STAT("mmu_cache_miss", mmu_cache_miss),
+	VM_STAT("mmu_unsync", mmu_unsync),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("largepages", lpages, .mode = 0444),
+	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 	{ NULL }
 };
 
 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
+
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+				  u64 data, bool write)
+{
+	const char *op = write ? "wrmsr" : "rdmsr";
+
+	if (ignore_msrs) {
+		if (report_ignored_msrs)
+			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		/* Mask the error */
+		return true;
+	} else {
+		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		return false;
+	}
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+	unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+	return kmem_cache_create_usercopy("x86_emulator", size,
+					  __alignof__(struct x86_emulate_ctxt),
+					  SLAB_ACCOUNT, useroffset,
+					  size - useroffset, NULL);
+}
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 		vcpu->arch.apf.gfns[i] = ~0;
 }
 
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
-	struct kvm_shared_msrs *locals
-		= container_of(urn, struct kvm_shared_msrs, urn);
-	struct kvm_shared_msr_values *values;
+	struct kvm_user_return_msrs *msrs
+		= container_of(urn, struct kvm_user_return_msrs, urn);
+	struct kvm_user_return_msr_values *values;
 	unsigned long flags;
 
 	/*
@@ -239,84 +308,89 @@
 	 * interrupted and executed through kvm_arch_hardware_disable()
 	 */
 	local_irq_save(flags);
-	if (locals->registered) {
-		locals->registered = false;
+	if (msrs->registered) {
+		msrs->registered = false;
 		user_return_notifier_unregister(urn);
 	}
 	local_irq_restore(flags);
-	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
-		values = &locals->values[slot];
+	for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+		values = &msrs->values[slot];
 		if (values->host != values->curr) {
-			wrmsrl(shared_msrs_global.msrs[slot], values->host);
+			wrmsrl(user_return_msrs_global.msrs[slot], values->host);
 			values->curr = values->host;
 		}
 	}
 }
 
-static void shared_msr_update(unsigned slot, u32 msr)
+int kvm_probe_user_return_msr(u32 msr)
 {
+	u64 val;
+	int ret;
+
+	preempt_disable();
+	ret = rdmsrl_safe(msr, &val);
+	if (ret)
+		goto out;
+	ret = wrmsrl_safe(msr, val);
+out:
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
+
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
+{
+	BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+	user_return_msrs_global.msrs[slot] = msr;
+	if (slot >= user_return_msrs_global.nr)
+		user_return_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 	u64 value;
-	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	int i;
 
-	/* only read, and nobody should modify it at this time,
-	 * so don't need lock */
-	if (slot >= shared_msrs_global.nr) {
-		printk(KERN_ERR "kvm: invalid MSR slot!");
-		return;
+	for (i = 0; i < user_return_msrs_global.nr; ++i) {
+		rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+		msrs->values[i].host = value;
+		msrs->values[i].curr = value;
 	}
-	rdmsrl_safe(msr, &value);
-	smsr->values[slot].host = value;
-	smsr->values[slot].curr = value;
 }
 
-void kvm_define_shared_msr(unsigned slot, u32 msr)
-{
-	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
-	shared_msrs_global.msrs[slot] = msr;
-	if (slot >= shared_msrs_global.nr)
-		shared_msrs_global.nr = slot + 1;
-}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
-
-static void kvm_shared_msr_cpu_online(void)
-{
-	unsigned i;
-
-	for (i = 0; i < shared_msrs_global.nr; ++i)
-		shared_msr_update(i, shared_msrs_global.msrs[i]);
-}
-
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 {
 	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 	int err;
 
-	value = (value & mask) | (smsr->values[slot].host & ~mask);
-	if (value == smsr->values[slot].curr)
+	value = (value & mask) | (msrs->values[slot].host & ~mask);
+	if (value == msrs->values[slot].curr)
 		return 0;
-	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+	err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
 	if (err)
 		return 1;
 
-	smsr->values[slot].curr = value;
-	if (!smsr->registered) {
-		smsr->urn.on_user_return = kvm_on_user_return;
-		user_return_notifier_register(&smsr->urn);
-		smsr->registered = true;
+	msrs->values[slot].curr = value;
+	if (!msrs->registered) {
+		msrs->urn.on_user_return = kvm_on_user_return;
+		user_return_notifier_register(&msrs->urn);
+		msrs->registered = true;
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
 	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 
-	if (smsr->registered)
-		kvm_on_user_return(&smsr->urn);
+	if (msrs->registered)
+		kvm_on_user_return(&msrs->urn);
 }
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
....@@ -348,14 +422,15 @@
348422 }
349423
350424 kvm_lapic_set_base(vcpu, msr_info->data);
425
+ kvm_recalculate_apic_map(vcpu->kvm);
351426 return 0;
352427 }
353428 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
354429
355
-asmlinkage __visible void kvm_spurious_fault(void)
430
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
356431 {
357432 /* Fault while not rebooting. We want the trace. */
358
- BUG();
433
+ BUG_ON(!kvm_rebooting);
359434 }
360435 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
361436
....@@ -384,6 +459,7 @@
384459 #define EXCPT_TRAP 1
385460 #define EXCPT_ABORT 2
386461 #define EXCPT_INTERRUPT 3
462
+#define EXCPT_DB 4
387463
388464 static int exception_type(int vector)
389465 {
....@@ -394,8 +470,14 @@
394470
395471 mask = 1 << vector;
396472
397
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
398
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
473
+ /*
474
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
475
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
476
+ */
477
+ if (mask & (1 << DB_VECTOR))
478
+ return EXCPT_DB;
479
+
480
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
399481 return EXCPT_TRAP;
400482
401483 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
....@@ -405,9 +487,59 @@
405487 return EXCPT_FAULT;
406488 }
407489
490
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
491
+{
492
+ unsigned nr = vcpu->arch.exception.nr;
493
+ bool has_payload = vcpu->arch.exception.has_payload;
494
+ unsigned long payload = vcpu->arch.exception.payload;
495
+
496
+ if (!has_payload)
497
+ return;
498
+
499
+ switch (nr) {
500
+ case DB_VECTOR:
501
+ /*
502
+ * "Certain debug exceptions may clear bit 0-3. The
503
+ * remaining contents of the DR6 register are never
504
+ * cleared by the processor".
505
+ */
506
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
507
+ /*
508
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
509
+ */
510
+ vcpu->arch.dr6 |= DR6_RTM;
511
+ vcpu->arch.dr6 |= payload;
512
+ /*
513
+ * Bit 16 should be set in the payload whenever the #DB
514
+ * exception should clear DR6.RTM. This makes the payload
515
+ * compatible with the pending debug exceptions under VMX.
516
+ * Though not currently documented in the SDM, this also
517
+ * makes the payload compatible with the exit qualification
518
+ * for #DB exceptions under VMX.
519
+ */
520
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
521
+
522
+ /*
523
+ * The #DB payload is defined as compatible with the 'pending
524
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
525
+ * defined in the 'pending debug exceptions' field (enabled
526
+ * breakpoint), it is reserved and must be zero in DR6.
527
+ */
528
+ vcpu->arch.dr6 &= ~BIT(12);
529
+ break;
530
+ case PF_VECTOR:
531
+ vcpu->arch.cr2 = payload;
532
+ break;
533
+ }
534
+
535
+ vcpu->arch.exception.has_payload = false;
536
+ vcpu->arch.exception.payload = 0;
537
+}
538
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
539
+
408540 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
409541 unsigned nr, bool has_error, u32 error_code,
410
- bool reinject)
542
+ bool has_payload, unsigned long payload, bool reinject)
411543 {
412544 u32 prev_nr;
413545 int class1, class2;
....@@ -427,6 +559,14 @@
427559 */
428560 WARN_ON_ONCE(vcpu->arch.exception.pending);
429561 vcpu->arch.exception.injected = true;
562
+ if (WARN_ON_ONCE(has_payload)) {
563
+ /*
564
+ * A reinjected event has already
565
+ * delivered its payload.
566
+ */
567
+ has_payload = false;
568
+ payload = 0;
569
+ }
430570 } else {
431571 vcpu->arch.exception.pending = true;
432572 vcpu->arch.exception.injected = false;
....@@ -434,6 +574,10 @@
434574 vcpu->arch.exception.has_error_code = has_error;
435575 vcpu->arch.exception.nr = nr;
436576 vcpu->arch.exception.error_code = error_code;
577
+ vcpu->arch.exception.has_payload = has_payload;
578
+ vcpu->arch.exception.payload = payload;
579
+ if (!is_guest_mode(vcpu))
580
+ kvm_deliver_exception_payload(vcpu);
437581 return;
438582 }
439583
....@@ -458,6 +602,8 @@
458602 vcpu->arch.exception.has_error_code = true;
459603 vcpu->arch.exception.nr = DF_VECTOR;
460604 vcpu->arch.exception.error_code = 0;
605
+ vcpu->arch.exception.has_payload = false;
606
+ vcpu->arch.exception.payload = 0;
461607 } else
462608 /* replace previous exception with a new one in a hope
463609 that instruction re-execution will regenerate lost
....@@ -467,15 +613,29 @@
467613
468614 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
469615 {
470
- kvm_multiple_exception(vcpu, nr, false, 0, false);
616
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
471617 }
472618 EXPORT_SYMBOL_GPL(kvm_queue_exception);
473619
474620 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
475621 {
476
- kvm_multiple_exception(vcpu, nr, false, 0, true);
622
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
477623 }
478624 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
625
+
626
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
627
+ unsigned long payload)
628
+{
629
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
630
+}
631
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
632
+
633
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
634
+ u32 error_code, unsigned long payload)
635
+{
636
+ kvm_multiple_exception(vcpu, nr, true, error_code,
637
+ true, payload, false);
638
+}
479639
480640 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
481641 {
....@@ -493,23 +653,38 @@
493653 ++vcpu->stat.pf_guest;
494654 vcpu->arch.exception.nested_apf =
495655 is_guest_mode(vcpu) && fault->async_page_fault;
496
- if (vcpu->arch.exception.nested_apf)
656
+ if (vcpu->arch.exception.nested_apf) {
497657 vcpu->arch.apf.nested_apf_token = fault->address;
498
- else
499
- vcpu->arch.cr2 = fault->address;
500
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
658
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
659
+ } else {
660
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
661
+ fault->address);
662
+ }
501663 }
502664 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
503665
504
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
666
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
667
+ struct x86_exception *fault)
505668 {
506
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
507
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
508
- else
509
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
669
+ struct kvm_mmu *fault_mmu;
670
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
510671
672
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
673
+ vcpu->arch.walk_mmu;
674
+
675
+ /*
676
+ * Invalidate the TLB entry for the faulting address, if it exists,
677
+ * else the access will fault indefinitely (and to emulate hardware).
678
+ */
679
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
680
+ !(fault->error_code & PFERR_RSVD_MASK))
681
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
682
+ fault_mmu->root_hpa);
683
+
684
+ fault_mmu->inject_page_fault(vcpu, fault);
511685 return fault->nested_page_fault;
512686 }
687
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
513688
514689 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
515690 {
....@@ -520,13 +695,13 @@
520695
521696 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
522697 {
523
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
698
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
524699 }
525700 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
526701
527702 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
528703 {
529
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
704
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
530705 }
531706 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
532707
....@@ -536,7 +711,7 @@
536711 */
537712 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
538713 {
539
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
714
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
540715 return true;
541716 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
542717 return false;
....@@ -618,10 +793,8 @@
618793 ret = 1;
619794
620795 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
621
- __set_bit(VCPU_EXREG_PDPTR,
622
- (unsigned long *)&vcpu->arch.regs_avail);
623
- __set_bit(VCPU_EXREG_PDPTR,
624
- (unsigned long *)&vcpu->arch.regs_dirty);
796
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
797
+
625798 out:
626799
627800 return ret;
....@@ -631,7 +804,6 @@
631804 bool pdptrs_changed(struct kvm_vcpu *vcpu)
632805 {
633806 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
634
- bool changed = true;
635807 int offset;
636808 gfn_t gfn;
637809 int r;
....@@ -639,8 +811,7 @@
639811 if (!is_pae_paging(vcpu))
640812 return false;
641813
642
- if (!test_bit(VCPU_EXREG_PDPTR,
643
- (unsigned long *)&vcpu->arch.regs_avail))
814
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
644815 return true;
645816
646817 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
....@@ -648,17 +819,16 @@
648819 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
649820 PFERR_USER_MASK | PFERR_WRITE_MASK);
650821 if (r < 0)
651
- goto out;
652
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
653
-out:
822
+ return true;
654823
655
- return changed;
824
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
656825 }
657826 EXPORT_SYMBOL_GPL(pdptrs_changed);
658827
659828 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
660829 {
661830 unsigned long old_cr0 = kvm_read_cr0(vcpu);
831
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
662832 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
663833
664834 cr0 |= X86_CR0_ET;
....@@ -676,27 +846,27 @@
676846 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
677847 return 1;
678848
679
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
680849 #ifdef CONFIG_X86_64
681
- if ((vcpu->arch.efer & EFER_LME)) {
682
- int cs_db, cs_l;
850
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
851
+ (cr0 & X86_CR0_PG)) {
852
+ int cs_db, cs_l;
683853
684
- if (!is_pae(vcpu))
685
- return 1;
686
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
687
- if (cs_l)
688
- return 1;
689
- } else
690
-#endif
691
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
692
- kvm_read_cr3(vcpu)))
854
+ if (!is_pae(vcpu))
855
+ return 1;
856
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
857
+ if (cs_l)
693858 return 1;
694859 }
860
+#endif
861
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
862
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
863
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
864
+ return 1;
695865
696866 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
697867 return 1;
698868
699
- kvm_x86_ops->set_cr0(vcpu, cr0);
869
+ kvm_x86_ops.set_cr0(vcpu, cr0);
700870
701871 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
702872 kvm_clear_async_pf_completion_queue(vcpu);
....@@ -721,27 +891,48 @@
721891 }
722892 EXPORT_SYMBOL_GPL(kvm_lmsw);
723893
724
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
894
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
725895 {
726
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
727
- !vcpu->guest_xcr0_loaded) {
728
- /* kvm_set_xcr() also depends on this */
896
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
897
+
729898 if (vcpu->arch.xcr0 != host_xcr0)
730899 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
731
- vcpu->guest_xcr0_loaded = 1;
732
- }
733
-}
734
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
735900
736
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
901
+ if (vcpu->arch.xsaves_enabled &&
902
+ vcpu->arch.ia32_xss != host_xss)
903
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
904
+ }
905
+
906
+ if (static_cpu_has(X86_FEATURE_PKU) &&
907
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
908
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
909
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
910
+ __write_pkru(vcpu->arch.pkru);
911
+}
912
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
913
+
914
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
737915 {
738
- if (vcpu->guest_xcr0_loaded) {
916
+ if (static_cpu_has(X86_FEATURE_PKU) &&
917
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
918
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
919
+ vcpu->arch.pkru = rdpkru();
920
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
921
+ __write_pkru(vcpu->arch.host_pkru);
922
+ }
923
+
924
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
925
+
739926 if (vcpu->arch.xcr0 != host_xcr0)
740927 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
741
- vcpu->guest_xcr0_loaded = 0;
928
+
929
+ if (vcpu->arch.xsaves_enabled &&
930
+ vcpu->arch.ia32_xss != host_xss)
931
+ wrmsrl(MSR_IA32_XSS, host_xss);
742932 }
933
+
743934 }
744
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
935
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
745936
746937 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
747938 {
....@@ -779,13 +970,13 @@
779970 vcpu->arch.xcr0 = xcr0;
780971
781972 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
782
- kvm_update_cpuid(vcpu);
973
+ kvm_update_cpuid_runtime(vcpu);
783974 return 0;
784975 }
785976
786977 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
787978 {
788
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
979
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
789980 __kvm_set_xcr(vcpu, index, xcr)) {
790981 kvm_inject_gp(vcpu, 0);
791982 return 1;
....@@ -794,63 +985,20 @@
794985 }
795986 EXPORT_SYMBOL_GPL(kvm_set_xcr);
796987
797
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
798
-{
799
- u64 reserved_bits = CR4_RESERVED_BITS;
800
-
801
- if (!cpu_has(c, X86_FEATURE_XSAVE))
802
- reserved_bits |= X86_CR4_OSXSAVE;
803
-
804
- if (!cpu_has(c, X86_FEATURE_SMEP))
805
- reserved_bits |= X86_CR4_SMEP;
806
-
807
- if (!cpu_has(c, X86_FEATURE_SMAP))
808
- reserved_bits |= X86_CR4_SMAP;
809
-
810
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
811
- reserved_bits |= X86_CR4_FSGSBASE;
812
-
813
- if (!cpu_has(c, X86_FEATURE_PKU))
814
- reserved_bits |= X86_CR4_PKE;
815
-
816
- if (!cpu_has(c, X86_FEATURE_LA57) &&
817
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
818
- reserved_bits |= X86_CR4_LA57;
819
-
820
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
821
- reserved_bits |= X86_CR4_UMIP;
822
-
823
- return reserved_bits;
824
-}
825
-
826
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
988
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
827989 {
828990 if (cr4 & cr4_reserved_bits)
829991 return -EINVAL;
830992
831
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
993
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
832994 return -EINVAL;
833995
834
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
835
- return -EINVAL;
836
-
837
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
838
- return -EINVAL;
839
-
840
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
841
- return -EINVAL;
842
-
843
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
844
- return -EINVAL;
845
-
846
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
847
- return -EINVAL;
848
-
849
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
996
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
850997 return -EINVAL;
851998
852999 return 0;
8531000 }
1001
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
8541002
8551003 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
8561004 {
....@@ -882,15 +1030,14 @@
8821030 return 1;
8831031 }
8841032
885
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
886
- return 1;
1033
+ kvm_x86_ops.set_cr4(vcpu, cr4);
8871034
8881035 if (((cr4 ^ old_cr4) & mmu_role_bits) ||
8891036 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
8901037 kvm_mmu_reset_context(vcpu);
8911038
8921039 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
893
- kvm_update_cpuid(vcpu);
1040
+ kvm_update_cpuid_runtime(vcpu);
8941041
8951042 return 0;
8961043 }
....@@ -911,21 +1058,21 @@
9111058 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
9121059 if (!skip_tlb_flush) {
9131060 kvm_mmu_sync_roots(vcpu);
914
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1061
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9151062 }
9161063 return 0;
9171064 }
9181065
9191066 if (is_long_mode(vcpu) &&
920
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
1067
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
9211068 return 1;
9221069 else if (is_pae_paging(vcpu) &&
9231070 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
9241071 return 1;
9251072
926
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
1073
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
9271074 vcpu->arch.cr3 = cr3;
928
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1075
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9291076
9301077 return 0;
9311078 }
....@@ -963,13 +1110,7 @@
9631110 }
9641111 }
9651112
966
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
967
-{
968
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
969
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
970
-}
971
-
972
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
1113
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
9731114 {
9741115 unsigned long dr7;
9751116
....@@ -977,11 +1118,12 @@
9771118 dr7 = vcpu->arch.guest_debug_dr7;
9781119 else
9791120 dr7 = vcpu->arch.dr7;
980
- kvm_x86_ops->set_dr7(vcpu, dr7);
1121
+ kvm_x86_ops.set_dr7(vcpu, dr7);
9811122 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
9821123 if (dr7 & DR7_BP_EN_MASK)
9831124 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
9841125 }
1126
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
9851127
9861128 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
9871129 {
....@@ -1003,17 +1145,14 @@
10031145 vcpu->arch.eff_db[dr] = val;
10041146 break;
10051147 case 4:
1006
- /* fall through */
10071148 case 6:
1008
- if (val & 0xffffffff00000000ULL)
1149
+ if (!kvm_dr6_valid(val))
10091150 return -1; /* #GP */
10101151 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1011
- kvm_update_dr6(vcpu);
10121152 break;
10131153 case 5:
1014
- /* fall through */
10151154 default: /* 7 */
1016
- if (val & 0xffffffff00000000ULL)
1155
+ if (!kvm_dr7_valid(val))
10171156 return -1; /* #GP */
10181157 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
10191158 kvm_update_dr7(vcpu);
....@@ -1042,15 +1181,10 @@
10421181 *val = vcpu->arch.db[array_index_nospec(dr, size)];
10431182 break;
10441183 case 4:
1045
- /* fall through */
10461184 case 6:
1047
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1048
- *val = vcpu->arch.dr6;
1049
- else
1050
- *val = kvm_x86_ops->get_dr6(vcpu);
1185
+ *val = vcpu->arch.dr6;
10511186 break;
10521187 case 5:
1053
- /* fall through */
10541188 default: /* 7 */
10551189 *val = vcpu->arch.dr7;
10561190 break;
....@@ -1061,15 +1195,15 @@
10611195
10621196 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
10631197 {
1064
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1198
+ u32 ecx = kvm_rcx_read(vcpu);
10651199 u64 data;
10661200 int err;
10671201
10681202 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
10691203 if (err)
10701204 return err;
1071
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1072
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1205
+ kvm_rax_write(vcpu, (u32)data);
1206
+ kvm_rdx_write(vcpu, data >> 32);
10731207 return err;
10741208 }
10751209 EXPORT_SYMBOL_GPL(kvm_rdpmc);
....@@ -1078,26 +1212,66 @@
10781212 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
10791213 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
10801214 *
1081
- * This list is modified at module load time to reflect the
1215
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
1216
+ * extract the supported MSRs from the related const lists.
1217
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
10821218 * capabilities of the host cpu. This capabilities test skips MSRs that are
1083
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1219
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
10841220 * may depend on host virtualization features rather than host cpu features.
10851221 */
10861222
1087
-static u32 msrs_to_save[] = {
1223
+static const u32 msrs_to_save_all[] = {
10881224 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
10891225 MSR_STAR,
10901226 #ifdef CONFIG_X86_64
10911227 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
10921228 #endif
10931229 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1094
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1095
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1230
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1231
+ MSR_IA32_SPEC_CTRL,
1232
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1233
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1234
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1235
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1236
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1237
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1238
+ MSR_IA32_UMWAIT_CONTROL,
1239
+
1240
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1241
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1242
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1243
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1244
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1245
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1246
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1247
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1248
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1249
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1250
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1251
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1252
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1253
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1254
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1255
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1256
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1257
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1258
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1259
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1260
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1261
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1262
+
1263
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1264
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1265
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1266
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1267
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1268
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
10961269 };
10971270
1271
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
10981272 static unsigned num_msrs_to_save;
10991273
1100
-static u32 emulated_msrs[] = {
1274
+static const u32 emulated_msrs_all[] = {
11011275 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
11021276 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
11031277 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
....@@ -1113,12 +1287,18 @@
11131287 HV_X64_MSR_VP_ASSIST_PAGE,
11141288 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
11151289 HV_X64_MSR_TSC_EMULATION_STATUS,
1290
+ HV_X64_MSR_SYNDBG_OPTIONS,
1291
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1292
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1293
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
11161294
11171295 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1118
- MSR_KVM_PV_EOI_EN,
1296
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
11191297
11201298 MSR_IA32_TSC_ADJUST,
11211299 MSR_IA32_TSCDEADLINE,
1300
+ MSR_IA32_ARCH_CAPABILITIES,
1301
+ MSR_IA32_PERF_CAPABILITIES,
11221302 MSR_IA32_MISC_ENABLE,
11231303 MSR_IA32_MCG_STATUS,
11241304 MSR_IA32_MCG_CTL,
....@@ -1128,15 +1308,41 @@
11281308 MSR_PLATFORM_INFO,
11291309 MSR_MISC_FEATURES_ENABLES,
11301310 MSR_AMD64_VIRT_SPEC_CTRL,
1311
+ MSR_IA32_POWER_CTL,
1312
+ MSR_IA32_UCODE_REV,
1313
+
1314
+ /*
1315
+ * The following list leaves out MSRs whose values are determined
1316
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1317
+ * We always support the "true" VMX control MSRs, even if the host
1318
+ * processor does not, so I am putting these registers here rather
1319
+ * than in msrs_to_save_all.
1320
+ */
1321
+ MSR_IA32_VMX_BASIC,
1322
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1323
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1324
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
1325
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1326
+ MSR_IA32_VMX_MISC,
1327
+ MSR_IA32_VMX_CR0_FIXED0,
1328
+ MSR_IA32_VMX_CR4_FIXED0,
1329
+ MSR_IA32_VMX_VMCS_ENUM,
1330
+ MSR_IA32_VMX_PROCBASED_CTLS2,
1331
+ MSR_IA32_VMX_EPT_VPID_CAP,
1332
+ MSR_IA32_VMX_VMFUNC,
1333
+
1334
+ MSR_K7_HWCR,
1335
+ MSR_KVM_POLL_CONTROL,
11311336 };
11321337
1338
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
11331339 static unsigned num_emulated_msrs;
11341340
11351341 /*
11361342 * List of msr numbers which are used to expose MSR-based features that
11371343 * can be used by a hypervisor to validate requested CPU features.
11381344 */
1139
-static u32 msr_based_features[] = {
1345
+static const u32 msr_based_features_all[] = {
11401346 MSR_IA32_VMX_BASIC,
11411347 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
11421348 MSR_IA32_VMX_PINBASED_CTLS,
....@@ -1156,18 +1362,41 @@
11561362 MSR_IA32_VMX_EPT_VPID_CAP,
11571363 MSR_IA32_VMX_VMFUNC,
11581364
1159
- MSR_F10H_DECFG,
1365
+ MSR_AMD64_DE_CFG,
11601366 MSR_IA32_UCODE_REV,
11611367 MSR_IA32_ARCH_CAPABILITIES,
1368
+ MSR_IA32_PERF_CAPABILITIES,
11621369 };
11631370
1371
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
11641372 static unsigned int num_msr_based_features;
11651373
1166
-u64 kvm_get_arch_capabilities(void)
1167
-{
1168
- u64 data;
1374
+/*
1375
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1376
+ * does not yet virtualize. These include:
1377
+ * 10 - MISC_PACKAGE_CTRLS
1378
+ * 11 - ENERGY_FILTERING_CTL
1379
+ * 12 - DOITM
1380
+ * 18 - FB_CLEAR_CTRL
1381
+ * 21 - XAPIC_DISABLE_STATUS
1382
+ * 23 - OVERCLOCKING_STATUS
1383
+ */
11691384
1170
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1385
+#define KVM_SUPPORTED_ARCH_CAP \
1386
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1387
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1388
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1389
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1390
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
1391
+
1392
+static u64 kvm_get_arch_capabilities(void)
1393
+{
1394
+ u64 data = 0;
1395
+
1396
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1397
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1398
+ data &= KVM_SUPPORTED_ARCH_CAP;
1399
+ }
11711400
11721401 /*
11731402 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
....@@ -1196,34 +1425,27 @@
11961425 if (!boot_cpu_has_bug(X86_BUG_MDS))
11971426 data |= ARCH_CAP_MDS_NO;
11981427
1199
- /*
1200
- * On TAA affected systems, export MDS_NO=0 when:
1201
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
1202
- * - Updated microcode is present. This is detected by
1203
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
1204
- * that VERW clears CPU buffers.
1205
- *
1206
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
1207
- * mitigation and don't complain:
1208
- *
1209
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
1210
- *
1211
- * If TSX is disabled on the system, guests are also mitigated against
1212
- * TAA and clear CPU buffer mitigation is not required for guests.
1213
- */
1214
- if (!boot_cpu_has(X86_FEATURE_RTM))
1428
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
1429
+ /*
1430
+ * If RTM=0 because the kernel has disabled TSX, the host might
1431
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
1432
+ * and therefore knows that there cannot be TAA) but keep
1433
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1434
+ * and we want to allow migrating those guests to tsx=off hosts.
1435
+ */
12151436 data &= ~ARCH_CAP_TAA_NO;
1216
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
1437
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
12171438 data |= ARCH_CAP_TAA_NO;
1218
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
1219
- data &= ~ARCH_CAP_MDS_NO;
1439
+ } else {
1440
+ /*
1441
+ * Nothing to do here; we emulate TSX_CTRL if present on the
1442
+ * host so the guest can choose between disabling TSX or
1443
+ * using VERW to clear CPU buffers.
1444
+ */
1445
+ }
12201446
1221
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
1222
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
12231447 return data;
12241448 }
1225
-
1226
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
12271449
12281450 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
12291451 {
....@@ -1235,8 +1457,7 @@
12351457 rdmsrl_safe(msr->index, &msr->data);
12361458 break;
12371459 default:
1238
- if (kvm_x86_ops->get_msr_feature(msr))
1239
- return 1;
1460
+ return kvm_x86_ops.get_msr_feature(msr);
12401461 }
12411462 return 0;
12421463 }
....@@ -1248,6 +1469,14 @@
12481469
12491470 msr.index = index;
12501471 r = kvm_get_msr_feature(&msr);
1472
+
1473
+ if (r == KVM_MSR_RET_INVALID) {
1474
+ /* Unconditionally clear the output for simplicity */
1475
+ *data = 0;
1476
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1477
+ r = 0;
1478
+ }
1479
+
12511480 if (r)
12521481 return r;
12531482
....@@ -1262,6 +1491,13 @@
12621491 return false;
12631492
12641493 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1494
+ return false;
1495
+
1496
+ if (efer & (EFER_LME | EFER_LMA) &&
1497
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1498
+ return false;
1499
+
1500
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
12651501 return false;
12661502
12671503 return true;
....@@ -1280,6 +1516,7 @@
12801516 {
12811517 u64 old_efer = vcpu->arch.efer;
12821518 u64 efer = msr_info->data;
1519
+ int r;
12831520
12841521 if (efer & efer_reserved_bits)
12851522 return 1;
....@@ -1296,7 +1533,11 @@
12961533 efer &= ~EFER_LMA;
12971534 efer |= vcpu->arch.efer & EFER_LMA;
12981535
1299
- kvm_x86_ops->set_efer(vcpu, efer);
1536
+ r = kvm_x86_ops.set_efer(vcpu, efer);
1537
+ if (r) {
1538
+ WARN_ON(r > 0);
1539
+ return r;
1540
+ }
13001541
13011542 /* Update reserved bits */
13021543 if ((efer ^ old_efer) & EFER_NX)
....@@ -1311,20 +1552,70 @@
13111552 }
13121553 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
13131554
1555
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1556
+{
1557
+ struct kvm_x86_msr_filter *msr_filter;
1558
+ struct msr_bitmap_range *ranges;
1559
+ struct kvm *kvm = vcpu->kvm;
1560
+ bool allowed;
1561
+ int idx;
1562
+ u32 i;
1563
+
1564
+ /* x2APIC MSRs do not support filtering. */
1565
+ if (index >= 0x800 && index <= 0x8ff)
1566
+ return true;
1567
+
1568
+ idx = srcu_read_lock(&kvm->srcu);
1569
+
1570
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1571
+ if (!msr_filter) {
1572
+ allowed = true;
1573
+ goto out;
1574
+ }
1575
+
1576
+ allowed = msr_filter->default_allow;
1577
+ ranges = msr_filter->ranges;
1578
+
1579
+ for (i = 0; i < msr_filter->count; i++) {
1580
+ u32 start = ranges[i].base;
1581
+ u32 end = start + ranges[i].nmsrs;
1582
+ u32 flags = ranges[i].flags;
1583
+ unsigned long *bitmap = ranges[i].bitmap;
1584
+
1585
+ if ((index >= start) && (index < end) && (flags & type)) {
1586
+ allowed = !!test_bit(index - start, bitmap);
1587
+ break;
1588
+ }
1589
+ }
1590
+
1591
+out:
1592
+ srcu_read_unlock(&kvm->srcu, idx);
1593
+
1594
+ return allowed;
1595
+}
1596
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1597
+
13141598 /*
1315
- * Writes msr value into into the appropriate "register".
1599
+ * Write @data into the MSR specified by @index. Select MSR specific fault
1600
+ * checks are bypassed if @host_initiated is %true.
13161601 * Returns 0 on success, non-0 otherwise.
13171602 * Assumes vcpu_load() was already called.
13181603 */
1319
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1604
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1605
+ bool host_initiated)
13201606 {
1321
- switch (msr->index) {
1607
+ struct msr_data msr;
1608
+
1609
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1610
+ return KVM_MSR_RET_FILTERED;
1611
+
1612
+ switch (index) {
13221613 case MSR_FS_BASE:
13231614 case MSR_GS_BASE:
13241615 case MSR_KERNEL_GS_BASE:
13251616 case MSR_CSTAR:
13261617 case MSR_LSTAR:
1327
- if (is_noncanonical_address(msr->data, vcpu))
1618
+ if (is_noncanonical_address(data, vcpu))
13281619 return 1;
13291620 break;
13301621 case MSR_IA32_SYSENTER_EIP:
....@@ -1341,54 +1632,313 @@
13411632 * value, and that something deterministic happens if the guest
13421633 * invokes 64-bit SYSENTER.
13431634 */
1344
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1635
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
13451636 }
1346
- return kvm_x86_ops->set_msr(vcpu, msr);
1637
+
1638
+ msr.data = data;
1639
+ msr.index = index;
1640
+ msr.host_initiated = host_initiated;
1641
+
1642
+ return kvm_x86_ops.set_msr(vcpu, &msr);
1643
+}
1644
+
1645
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1646
+ u32 index, u64 data, bool host_initiated)
1647
+{
1648
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1649
+
1650
+ if (ret == KVM_MSR_RET_INVALID)
1651
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
1652
+ ret = 0;
1653
+
1654
+ return ret;
1655
+}
1656
+
1657
+/*
1658
+ * Read the MSR specified by @index into @data. Select MSR specific fault
1659
+ * checks are bypassed if @host_initiated is %true.
1660
+ * Returns 0 on success, non-0 otherwise.
1661
+ * Assumes vcpu_load() was already called.
1662
+ */
1663
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1664
+ bool host_initiated)
1665
+{
1666
+ struct msr_data msr;
1667
+ int ret;
1668
+
1669
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1670
+ return KVM_MSR_RET_FILTERED;
1671
+
1672
+ msr.index = index;
1673
+ msr.host_initiated = host_initiated;
1674
+
1675
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
1676
+ if (!ret)
1677
+ *data = msr.data;
1678
+ return ret;
1679
+}
1680
+
1681
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1682
+ u32 index, u64 *data, bool host_initiated)
1683
+{
1684
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1685
+
1686
+ if (ret == KVM_MSR_RET_INVALID) {
1687
+ /* Unconditionally clear *data for simplicity */
1688
+ *data = 0;
1689
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1690
+ ret = 0;
1691
+ }
1692
+
1693
+ return ret;
1694
+}
1695
+
1696
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1697
+{
1698
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
1699
+}
1700
+EXPORT_SYMBOL_GPL(kvm_get_msr);
1701
+
1702
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1703
+{
1704
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
13471705 }
13481706 EXPORT_SYMBOL_GPL(kvm_set_msr);
1707
+
1708
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1709
+{
1710
+ if (vcpu->run->msr.error) {
1711
+ kvm_inject_gp(vcpu, 0);
1712
+ return 1;
1713
+ } else if (is_read) {
1714
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1715
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1716
+ }
1717
+
1718
+ return kvm_skip_emulated_instruction(vcpu);
1719
+}
1720
+
1721
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1722
+{
1723
+ return complete_emulated_msr(vcpu, true);
1724
+}
1725
+
1726
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1727
+{
1728
+ return complete_emulated_msr(vcpu, false);
1729
+}
1730
+
1731
+static u64 kvm_msr_reason(int r)
1732
+{
1733
+ switch (r) {
1734
+ case KVM_MSR_RET_INVALID:
1735
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
1736
+ case KVM_MSR_RET_FILTERED:
1737
+ return KVM_MSR_EXIT_REASON_FILTER;
1738
+ default:
1739
+ return KVM_MSR_EXIT_REASON_INVAL;
1740
+ }
1741
+}
1742
+
1743
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1744
+ u32 exit_reason, u64 data,
1745
+ int (*completion)(struct kvm_vcpu *vcpu),
1746
+ int r)
1747
+{
1748
+ u64 msr_reason = kvm_msr_reason(r);
1749
+
1750
+ /* Check if the user wanted to know about this MSR fault */
1751
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1752
+ return 0;
1753
+
1754
+ vcpu->run->exit_reason = exit_reason;
1755
+ vcpu->run->msr.error = 0;
1756
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1757
+ vcpu->run->msr.reason = msr_reason;
1758
+ vcpu->run->msr.index = index;
1759
+ vcpu->run->msr.data = data;
1760
+ vcpu->arch.complete_userspace_io = completion;
1761
+
1762
+ return 1;
1763
+}
1764
+
1765
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1766
+{
1767
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1768
+ complete_emulated_rdmsr, r);
1769
+}
1770
+
1771
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1772
+{
1773
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1774
+ complete_emulated_wrmsr, r);
1775
+}
1776
+
1777
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1778
+{
1779
+ u32 ecx = kvm_rcx_read(vcpu);
1780
+ u64 data;
1781
+ int r;
1782
+
1783
+ r = kvm_get_msr(vcpu, ecx, &data);
1784
+
1785
+ /* MSR read failed? See if we should ask user space */
1786
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1787
+ /* Bounce to user space */
1788
+ return 0;
1789
+ }
1790
+
1791
+ /* MSR read failed? Inject a #GP */
1792
+ if (r) {
1793
+ trace_kvm_msr_read_ex(ecx);
1794
+ kvm_inject_gp(vcpu, 0);
1795
+ return 1;
1796
+ }
1797
+
1798
+ trace_kvm_msr_read(ecx, data);
1799
+
1800
+ kvm_rax_write(vcpu, data & -1u);
1801
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
1802
+ return kvm_skip_emulated_instruction(vcpu);
1803
+}
1804
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1805
+
1806
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1807
+{
1808
+ u32 ecx = kvm_rcx_read(vcpu);
1809
+ u64 data = kvm_read_edx_eax(vcpu);
1810
+ int r;
1811
+
1812
+ r = kvm_set_msr(vcpu, ecx, data);
1813
+
1814
+ /* MSR write failed? See if we should ask user space */
1815
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1816
+ /* Bounce to user space */
1817
+ return 0;
1818
+
1819
+ /* Signal all other negative errors to userspace */
1820
+ if (r < 0)
1821
+ return r;
1822
+
1823
+ /* MSR write failed? Inject a #GP */
1824
+ if (r > 0) {
1825
+ trace_kvm_msr_write_ex(ecx, data);
1826
+ kvm_inject_gp(vcpu, 0);
1827
+ return 1;
1828
+ }
1829
+
1830
+ trace_kvm_msr_write(ecx, data);
1831
+ return kvm_skip_emulated_instruction(vcpu);
1832
+}
1833
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1834
+
1835
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1836
+{
1837
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1838
+ xfer_to_guest_mode_work_pending();
1839
+}
1840
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1841
+
1842
+/*
1843
+ * The fast path for frequent and performance sensitive wrmsr emulation,
1844
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
1845
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
1846
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
1847
+ * other cases which must be called after interrupts are enabled on the host.
1848
+ */
1849
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1850
+{
1851
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1852
+ return 1;
1853
+
1854
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1855
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1856
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1857
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1858
+
1859
+ data &= ~(1 << 12);
1860
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1861
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1862
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1863
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
1864
+ return 0;
1865
+ }
1866
+
1867
+ return 1;
1868
+}
1869
+
1870
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1871
+{
1872
+ if (!kvm_can_use_hv_timer(vcpu))
1873
+ return 1;
1874
+
1875
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
1876
+ return 0;
1877
+}
1878
+
1879
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1880
+{
1881
+ u32 msr = kvm_rcx_read(vcpu);
1882
+ u64 data;
1883
+ fastpath_t ret = EXIT_FASTPATH_NONE;
1884
+
1885
+ switch (msr) {
1886
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
1887
+ data = kvm_read_edx_eax(vcpu);
1888
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1889
+ kvm_skip_emulated_instruction(vcpu);
1890
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
1891
+ }
1892
+ break;
1893
+ case MSR_IA32_TSCDEADLINE:
1894
+ data = kvm_read_edx_eax(vcpu);
1895
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1896
+ kvm_skip_emulated_instruction(vcpu);
1897
+ ret = EXIT_FASTPATH_REENTER_GUEST;
1898
+ }
1899
+ break;
1900
+ default:
1901
+ break;
1902
+ }
1903
+
1904
+ if (ret != EXIT_FASTPATH_NONE)
1905
+ trace_kvm_msr_write(msr, data);
1906
+
1907
+ return ret;
1908
+}
1909
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
13491910
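
A quick worked example of the fastpath's MSR match: with the usual APIC_BASE_MSR (0x800) and APIC_ICR (0x300) values from asm/apicdef.h, APIC_BASE_MSR + (APIC_ICR >> 4) is 0x830, the x2APIC ICR, whose 64-bit payload splits into a low word written to APIC_ICR and the destination written to APIC_ICR2. A stand-alone sketch; the constants and the sample ICR value are illustrative assumptions, not taken from this file:

#include <stdio.h>
#include <stdint.h>

#define APIC_BASE_MSR 0x800     /* base of the x2APIC MSR range */
#define APIC_ICR      0x300     /* xAPIC MMIO offset of the ICR  */

int main(void)
{
    /* Fixed-mode, physical-destination IPI: vector 0x2a to APIC ID 3. */
    uint64_t icr = ((uint64_t)3 << 32) | 0x402a;

    printf("fastpath MSR index   : 0x%x\n", APIC_BASE_MSR + (APIC_ICR >> 4)); /* 0x830 */
    printf("low word  (APIC_ICR) : 0x%08x\n", (uint32_t)icr);
    printf("high word (APIC_ICR2): 0x%08x\n", (uint32_t)(icr >> 32));
    return 0;
}
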
13501911 /*
13511912 * Adapt set_msr() to msr_io()'s calling convention
13521913 */
13531914 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13541915 {
1355
- struct msr_data msr;
1356
- int r;
1357
-
1358
- msr.index = index;
1359
- msr.host_initiated = true;
1360
- r = kvm_get_msr(vcpu, &msr);
1361
- if (r)
1362
- return r;
1363
-
1364
- *data = msr.data;
1365
- return 0;
1916
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
13661917 }
13671918
13681919 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13691920 {
1370
- struct msr_data msr;
1371
-
1372
- msr.data = *data;
1373
- msr.index = index;
1374
- msr.host_initiated = true;
1375
- return kvm_set_msr(vcpu, &msr);
1921
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
13761922 }
13771923
13781924 #ifdef CONFIG_X86_64
1925
+struct pvclock_clock {
1926
+ int vclock_mode;
1927
+ u64 cycle_last;
1928
+ u64 mask;
1929
+ u32 mult;
1930
+ u32 shift;
1931
+ u64 base_cycles;
1932
+ u64 offset;
1933
+};
1934
+
13791935 struct pvclock_gtod_data {
13801936 seqcount_t seq;
13811937
1382
- struct { /* extract of a clocksource struct */
1383
- int vclock_mode;
1384
- u64 cycle_last;
1385
- u64 mask;
1386
- u32 mult;
1387
- u32 shift;
1388
- } clock;
1938
+ struct pvclock_clock clock; /* extract of a clocksource struct */
1939
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
13891940
1390
- u64 boot_ns;
1391
- u64 nsec_base;
1941
+ ktime_t offs_boot;
13921942 u64 wall_time_sec;
13931943 };
13941944
....@@ -1397,44 +1947,54 @@
13971947 static void update_pvclock_gtod(struct timekeeper *tk)
13981948 {
13991949 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1400
- u64 boot_ns;
1401
-
1402
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
14031950
14041951 write_seqcount_begin(&vdata->seq);
14051952
14061953 /* copy pvclock gtod data */
1407
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1954
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
14081955 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
14091956 vdata->clock.mask = tk->tkr_mono.mask;
14101957 vdata->clock.mult = tk->tkr_mono.mult;
14111958 vdata->clock.shift = tk->tkr_mono.shift;
1959
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
1960
+ vdata->clock.offset = tk->tkr_mono.base;
14121961
1413
- vdata->boot_ns = boot_ns;
1414
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1962
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
1963
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
1964
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
1965
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
1966
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
1967
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
1968
+ vdata->raw_clock.offset = tk->tkr_raw.base;
14151969
14161970 vdata->wall_time_sec = tk->xtime_sec;
14171971
1972
+ vdata->offs_boot = tk->offs_boot;
1973
+
14181974 write_seqcount_end(&vdata->seq);
14191975 }
1420
-#endif
14211976
1422
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1977
+static s64 get_kvmclock_base_ns(void)
14231978 {
1424
- /*
1425
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1426
- * vcpu_enter_guest. This function is only called from
1427
- * the physical CPU that is running vcpu.
1428
- */
1429
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1979
+ /* Count up from boot time, but with the frequency of the raw clock. */
1980
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
14301981 }
1982
+#else
1983
+static s64 get_kvmclock_base_ns(void)
1984
+{
1985
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
1986
+ return ktime_get_boottime_ns();
1987
+}
1988
+#endif
14311989
14321990 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
14331991 {
14341992 int version;
14351993 int r;
14361994 struct pvclock_wall_clock wc;
1437
- struct timespec64 boot;
1995
+ u64 wall_nsec;
1996
+
1997
+ kvm->arch.wall_clock = wall_clock;
14381998
14391999 if (!wall_clock)
14402000 return;
....@@ -1454,23 +2014,46 @@
14542014 /*
14552015 * The guest calculates current wall clock time by adding
14562016 * system time (updated by kvm_guest_time_update below) to the
1457
- * wall clock specified here. guest system time equals host
1458
- * system time for us, thus we must fill in host boot time here.
2017
+ * wall clock specified here. We do the reverse here.
14592018 */
1460
- getboottime64(&boot);
2019
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
14612020
1462
- if (kvm->arch.kvmclock_offset) {
1463
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1464
- boot = timespec64_sub(boot, ts);
1465
- }
1466
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1467
- wc.nsec = boot.tv_nsec;
2021
+ wc.nsec = do_div(wall_nsec, 1000000000);
2022
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
14682023 wc.version = version;
14692024
14702025 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
14712026
14722027 version++;
14732028 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2029
+}
2030
+
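
The sec/nsec split below relies on do_div(), which divides its first argument in place (leaving the quotient) and returns the remainder, so wc.nsec receives the sub-second part and wc.sec the truncated seconds. A plain-C stand-in with a made-up delta:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    /* Illustrative value for ktime_get_real_ns() - get_kvmclock_ns(kvm). */
    uint64_t wall_nsec = 1700000000123456789ULL;

    uint32_t nsec = (uint32_t)(wall_nsec % NSEC_PER_SEC); /* what do_div() returns      */
    uint32_t sec  = (uint32_t)(wall_nsec / NSEC_PER_SEC); /* what do_div() leaves behind */

    printf("wc.sec = %u, wc.nsec = %u\n", sec, nsec);     /* 1700000000, 123456789 */
    return 0;
}
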
2031
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2032
+ bool old_msr, bool host_initiated)
2033
+{
2034
+ struct kvm_arch *ka = &vcpu->kvm->arch;
2035
+
2036
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
2037
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2038
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2039
+
2040
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
2041
+ }
2042
+
2043
+ vcpu->arch.time = system_time;
2044
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2045
+
2046
+ /* we verify if the enable bit is set... */
2047
+ vcpu->arch.pv_time_enabled = false;
2048
+ if (!(system_time & 1))
2049
+ return;
2050
+
2051
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2052
+ &vcpu->arch.pv_time, system_time & ~1ULL,
2053
+ sizeof(struct pvclock_vcpu_time_info)))
2054
+ vcpu->arch.pv_time_enabled = true;
2055
+
2056
+ return;
14742057 }
14752058
14762059 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
....@@ -1505,9 +2088,6 @@
15052088
15062089 *pshift = shift;
15072090 *pmultiplier = div_frac(scaled64, tps32);
1508
-
1509
- pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1510
- __func__, base_hz, scaled_hz, shift, *pmultiplier);
15112091 }
15122092
15132093 #ifdef CONFIG_X86_64
....@@ -1604,7 +2184,7 @@
16042184
16052185 static inline int gtod_is_based_on_tsc(int mode)
16062186 {
1607
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
2187
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
16082188 }
16092189
16102190 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
....@@ -1633,12 +2213,6 @@
16332213 atomic_read(&vcpu->kvm->online_vcpus),
16342214 ka->use_master_clock, gtod->clock.vclock_mode);
16352215 #endif
1636
-}
1637
-
1638
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1639
-{
1640
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1641
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
16422216 }
16432217
16442218 /*
....@@ -1679,15 +2253,14 @@
16792253
16802254 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
16812255 {
1682
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1683
-
1684
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2256
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
16852257 }
16862258 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
16872259
16882260 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
16892261 {
1690
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
2262
+ vcpu->arch.l1_tsc_offset = offset;
2263
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
16912264 }
16922265
16932266 static inline bool kvm_check_tsc_unstable(void)
....@@ -1697,29 +2270,28 @@
16972270 * TSC is marked unstable when we're running on Hyper-V,
16982271 * 'TSC page' clocksource is good.
16992272 */
1700
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
2273
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
17012274 return false;
17022275 #endif
17032276 return check_tsc_unstable();
17042277 }
17052278
1706
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
2279
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
17072280 {
17082281 struct kvm *kvm = vcpu->kvm;
17092282 u64 offset, ns, elapsed;
17102283 unsigned long flags;
17112284 bool matched;
17122285 bool already_matched;
1713
- u64 data = msr->data;
17142286 bool synchronizing = false;
17152287
17162288 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
17172289 offset = kvm_compute_tsc_offset(vcpu, data);
1718
- ns = ktime_get_boot_ns();
2290
+ ns = get_kvmclock_base_ns();
17192291 elapsed = ns - kvm->arch.last_tsc_nsec;
17202292
17212293 if (vcpu->arch.virtual_tsc_khz) {
1722
- if (data == 0 && msr->host_initiated) {
2294
+ if (data == 0) {
17232295 /*
17242296 * detection of vcpu initialization -- need to sync
17252297 * with other vCPUs. This particularly helps to keep
....@@ -1750,12 +2322,10 @@
17502322 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
17512323 if (!kvm_check_tsc_unstable()) {
17522324 offset = kvm->arch.cur_tsc_offset;
1753
- pr_debug("kvm: matched tsc offset for %llu\n", data);
17542325 } else {
17552326 u64 delta = nsec_to_cycles(vcpu, elapsed);
17562327 data += delta;
17572328 offset = kvm_compute_tsc_offset(vcpu, data);
1758
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
17592329 }
17602330 matched = true;
17612331 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
....@@ -1774,8 +2344,6 @@
17742344 kvm->arch.cur_tsc_write = data;
17752345 kvm->arch.cur_tsc_offset = offset;
17762346 matched = false;
1777
- pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1778
- kvm->arch.cur_tsc_generation, data);
17792347 }
17802348
17812349 /*
....@@ -1793,9 +2361,6 @@
17932361 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
17942362 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
17952363
1796
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1797
- update_ia32_tsc_adjust_msr(vcpu, offset);
1798
-
17992364 kvm_vcpu_write_tsc_offset(vcpu, offset);
18002365 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
18012366
....@@ -1810,12 +2375,10 @@
18102375 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
18112376 }
18122377
1813
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
1814
-
18152378 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
18162379 s64 adjustment)
18172380 {
1818
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
2381
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
18192382 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
18202383 }
18212384
....@@ -1849,43 +2412,43 @@
18492412 return last;
18502413 }
18512414
1852
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
2415
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2416
+ int *mode)
18532417 {
18542418 long v;
1855
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18562419 u64 tsc_pg_val;
18572420
1858
- switch (gtod->clock.vclock_mode) {
1859
- case VCLOCK_HVCLOCK:
2421
+ switch (clock->vclock_mode) {
2422
+ case VDSO_CLOCKMODE_HVCLOCK:
18602423 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
18612424 tsc_timestamp);
18622425 if (tsc_pg_val != U64_MAX) {
18632426 /* TSC page valid */
1864
- *mode = VCLOCK_HVCLOCK;
1865
- v = (tsc_pg_val - gtod->clock.cycle_last) &
1866
- gtod->clock.mask;
2427
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
2428
+ v = (tsc_pg_val - clock->cycle_last) &
2429
+ clock->mask;
18672430 } else {
18682431 /* TSC page invalid */
1869
- *mode = VCLOCK_NONE;
2432
+ *mode = VDSO_CLOCKMODE_NONE;
18702433 }
18712434 break;
1872
- case VCLOCK_TSC:
1873
- *mode = VCLOCK_TSC;
2435
+ case VDSO_CLOCKMODE_TSC:
2436
+ *mode = VDSO_CLOCKMODE_TSC;
18742437 *tsc_timestamp = read_tsc();
1875
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
1876
- gtod->clock.mask;
2438
+ v = (*tsc_timestamp - clock->cycle_last) &
2439
+ clock->mask;
18772440 break;
18782441 default:
1879
- *mode = VCLOCK_NONE;
2442
+ *mode = VDSO_CLOCKMODE_NONE;
18802443 }
18812444
1882
- if (*mode == VCLOCK_NONE)
2445
+ if (*mode == VDSO_CLOCKMODE_NONE)
18832446 *tsc_timestamp = v = 0;
18842447
1885
- return v * gtod->clock.mult;
2448
+ return v * clock->mult;
18862449 }
18872450
1888
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
2451
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
18892452 {
18902453 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18912454 unsigned long seq;
....@@ -1894,10 +2457,10 @@
18942457
18952458 do {
18962459 seq = read_seqcount_begin(&gtod->seq);
1897
- ns = gtod->nsec_base;
1898
- ns += vgettsc(tsc_timestamp, &mode);
1899
- ns >>= gtod->clock.shift;
1900
- ns += gtod->boot_ns;
2460
+ ns = gtod->raw_clock.base_cycles;
2461
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2462
+ ns >>= gtod->raw_clock.shift;
2463
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
19012464 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19022465 *t = ns;
19032466
....@@ -1914,8 +2477,8 @@
19142477 do {
19152478 seq = read_seqcount_begin(&gtod->seq);
19162479 ts->tv_sec = gtod->wall_time_sec;
1917
- ns = gtod->nsec_base;
1918
- ns += vgettsc(tsc_timestamp, &mode);
2480
+ ns = gtod->clock.base_cycles;
2481
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
19192482 ns >>= gtod->clock.shift;
19202483 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19212484
....@@ -1932,7 +2495,7 @@
19322495 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
19332496 return false;
19342497
1935
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
2498
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
19362499 tsc_timestamp));
19372500 }
19382501
....@@ -2057,7 +2620,7 @@
20572620 spin_lock(&ka->pvclock_gtod_sync_lock);
20582621 if (!ka->use_master_clock) {
20592622 spin_unlock(&ka->pvclock_gtod_sync_lock);
2060
- return ktime_get_boot_ns() + ka->kvmclock_offset;
2623
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
20612624 }
20622625
20632626 hv_clock.tsc_timestamp = ka->master_cycle_now;
....@@ -2073,7 +2636,7 @@
20732636 &hv_clock.tsc_to_system_mul);
20742637 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
20752638 } else
2076
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
2639
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
20772640
20782641 put_cpu();
20792642
....@@ -2172,7 +2735,7 @@
21722735 }
21732736 if (!use_master_clock) {
21742737 host_tsc = rdtsc();
2175
- kernel_ns = ktime_get_boot_ns();
2738
+ kernel_ns = get_kvmclock_base_ns();
21762739 }
21772740
21782741 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
....@@ -2284,6 +2847,18 @@
22842847 KVMCLOCK_SYNC_PERIOD);
22852848 }
22862849
2850
+/*
2851
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2852
+ */
2853
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2854
+{
2855
+ /* McStatusWrEn enabled? */
2856
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
2857
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2858
+
2859
+ return false;
2860
+}
2861
+
22872862 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
22882863 {
22892864 u64 mcg_cap = vcpu->arch.mcg_cap;
....@@ -2313,14 +2888,22 @@
23132888 /* only 0 or all 1s can be written to IA32_MCi_CTL
23142889 * some Linux kernels though clear bit 10 in bank 4 to
23152890 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
2316
- * this to avoid an uncatched #GP in the guest
2891
+ * this to avoid an uncaught #GP in the guest.
2892
+ *
2893
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
2894
+ * correctable, single-bit ECC data errors.
23172895 */
23182896 if ((offset & 0x3) == 0 &&
2319
- data != 0 && (data | (1 << 10)) != ~(u64)0)
2320
- return -1;
2897
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
2898
+ return 1;
2899
+
2900
+ /* MCi_STATUS */
23212901 if (!msr_info->host_initiated &&
2322
- (offset & 0x3) == 1 && data != 0)
2323
- return -1;
2902
+ (offset & 0x3) == 1 && data != 0) {
2903
+ if (!can_set_mci_status(vcpu))
2904
+ return 1;
2905
+ }
2906
+
23242907 vcpu->arch.mce_banks[offset] = data;
23252908 break;
23262909 }
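
The relaxed IA32_MCi_CTL check above accepts 0, all ones, and all ones with bit 10 (the K8 GART quirk) and/or bit 0 (the UnixWare ECC quirk) cleared; anything else returns 1 and, for guest writes, ends up as a #GP. A small stand-alone check that mirrors the condition:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the MCi_CTL validity test used above. */
static int mci_ctl_rejected(uint64_t data)
{
    return data != 0 && (data | (1ULL << 10) | 1) != ~(uint64_t)0;
}

int main(void)
{
    uint64_t vals[] = {
        0,                          /* disabled: accepted          */
        ~0ULL,                      /* all error reporting on      */
        ~0ULL & ~(1ULL << 10),      /* K8 GART workaround          */
        ~0ULL & ~1ULL,              /* UnixWare single-bit ECC     */
        0x1,                        /* partial enable: rejected    */
    };

    for (unsigned int i = 0; i < sizeof(vals) / sizeof(vals[0]); i++)
        printf("%#018llx -> %s\n", (unsigned long long)vals[i],
               mci_ctl_rejected(vals[i]) ? "reject (#GP)" : "accept");
    return 0;
}
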
....@@ -2340,61 +2923,99 @@
23402923 u32 page_num = data & ~PAGE_MASK;
23412924 u64 page_addr = data & PAGE_MASK;
23422925 u8 *page;
2343
- int r;
23442926
2345
- r = -E2BIG;
23462927 if (page_num >= blob_size)
2347
- goto out;
2348
- r = -ENOMEM;
2928
+ return 1;
2929
+
23492930 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2350
- if (IS_ERR(page)) {
2351
- r = PTR_ERR(page);
2352
- goto out;
2931
+ if (IS_ERR(page))
2932
+ return PTR_ERR(page);
2933
+
2934
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2935
+ kfree(page);
2936
+ return 1;
23532937 }
2354
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2355
- goto out_free;
2356
- r = 0;
2357
-out_free:
2358
- kfree(page);
2359
-out:
2360
- return r;
2938
+ return 0;
2939
+}
2940
+
2941
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2942
+{
2943
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2944
+
2945
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
23612946 }
23622947
23632948 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
23642949 {
23652950 gpa_t gpa = data & ~0x3f;
23662951
2367
- /* Bits 3:5 are reserved, Should be zero */
2368
- if (data & 0x38)
2952
+ /* Bits 4:5 are reserved; should be zero */
2953
+ if (data & 0x30)
23692954 return 1;
23702955
2371
- vcpu->arch.apf.msr_val = data;
2956
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2957
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2958
+ return 1;
23722959
2373
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
2960
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2961
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2962
+ return 1;
2963
+
2964
+ if (!lapic_in_kernel(vcpu))
2965
+ return data ? 1 : 0;
2966
+
2967
+ vcpu->arch.apf.msr_en_val = data;
2968
+
2969
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
23742970 kvm_clear_async_pf_completion_queue(vcpu);
23752971 kvm_async_pf_hash_reset(vcpu);
23762972 return 0;
23772973 }
23782974
23792975 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2380
- sizeof(u32)))
2976
+ sizeof(u64)))
23812977 return 1;
23822978
23832979 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
23842980 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2981
+
23852982 kvm_async_pf_wakeup_all(vcpu);
2983
+
2984
+ return 0;
2985
+}
2986
+
2987
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2988
+{
2989
+ /* Bits 8-63 are reserved */
2990
+ if (data >> 8)
2991
+ return 1;
2992
+
2993
+ if (!lapic_in_kernel(vcpu))
2994
+ return 1;
2995
+
2996
+ vcpu->arch.apf.msr_int_val = data;
2997
+
2998
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
2999
+
23863000 return 0;
23873001 }
23883002
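
Putting the two enable paths together: a guest selecting the interrupt-based flavour writes a 64-byte-aligned GPA plus flag bits into MSR_KVM_ASYNC_PF_EN and the notification vector into MSR_KVM_ASYNC_PF_INT. A guest-side sketch; the bit positions are assumed to match uapi/asm/kvm_para.h (ENABLED bit 0, SEND_ALWAYS bit 1, DELIVERY_AS_PF_VMEXIT bit 2, DELIVERY_AS_INT bit 3) rather than taken from this file:

#include <stdint.h>

#define KVM_ASYNC_PF_ENABLED          (1 << 0)   /* assumed bit layout */
#define KVM_ASYNC_PF_SEND_ALWAYS      (1 << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_INT  (1 << 3)

/* Value a guest would write to MSR_KVM_ASYNC_PF_EN: the shared area must be
 * 64-byte aligned because the low six bits carry flags, and bits 4-5 must
 * stay zero or the write above fails with #GP. */
static uint64_t async_pf_en_value(uint64_t shared_area_gpa)
{
    return (shared_area_gpa & ~0x3fULL) |
           KVM_ASYNC_PF_ENABLED |
           KVM_ASYNC_PF_DELIVERY_AS_INT;
}
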
23893003 static void kvmclock_reset(struct kvm_vcpu *vcpu)
23903004 {
23913005 vcpu->arch.pv_time_enabled = false;
3006
+ vcpu->arch.time = 0;
23923007 }
23933008
2394
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
3009
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
23953010 {
23963011 ++vcpu->stat.tlb_flush;
2397
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
3012
+ kvm_x86_ops.tlb_flush_all(vcpu);
3013
+}
3014
+
3015
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3016
+{
3017
+ ++vcpu->stat.tlb_flush;
3018
+ kvm_x86_ops.tlb_flush_guest(vcpu);
23983019 }
23993020
24003021 static void record_steal_time(struct kvm_vcpu *vcpu)
....@@ -2417,8 +3038,14 @@
24173038 * Doing a TLB flush here, on the guest's behalf, can avoid
24183039 * expensive IPIs.
24193040 */
2420
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2421
- kvm_vcpu_flush_tlb(vcpu, false);
3041
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3042
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3043
+ st->preempted & KVM_VCPU_FLUSH_TLB);
3044
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
3045
+ kvm_vcpu_flush_tlb_guest(vcpu);
3046
+ } else {
3047
+ st->preempted = 0;
3048
+ }
24223049
24233050 vcpu->arch.st.preempted = 0;
24243051
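
The xchg() on st->preempted above is a grab-and-clear: it atomically fetches whatever flags the host set (KVM_VCPU_FLUSH_TLB among them) and zeroes them, so a flush request is neither lost nor handled twice. A userspace stand-in using GCC atomics, with the flag value assumed to match uapi/asm/kvm_para.h:

#include <stdio.h>
#include <stdint.h>

#define KVM_VCPU_FLUSH_TLB   (1 << 1)   /* assumed to match uapi kvm_para.h */

int main(void)
{
    uint8_t preempted = KVM_VCPU_FLUSH_TLB;          /* as if set by the host */
    uint8_t old = __atomic_exchange_n(&preempted, 0, __ATOMIC_SEQ_CST);

    if (old & KVM_VCPU_FLUSH_TLB)
        printf("flush requested; flags now %u\n", preempted);  /* flags now 0 */
    return 0;
}
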
....@@ -2465,14 +3092,31 @@
24653092 return 1;
24663093 vcpu->arch.arch_capabilities = data;
24673094 break;
3095
+ case MSR_IA32_PERF_CAPABILITIES: {
3096
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3097
+
3098
+ if (!msr_info->host_initiated)
3099
+ return 1;
3100
+ if (kvm_get_msr_feature(&msr_ent))
3101
+ return 1;
3102
+ if (data & ~msr_ent.data)
3103
+ return 1;
3104
+
3105
+ vcpu->arch.perf_capabilities = data;
3106
+
3107
+ return 0;
3108
+ }
24683109 case MSR_EFER:
24693110 return set_efer(vcpu, msr_info);
24703111 case MSR_K7_HWCR:
24713112 data &= ~(u64)0x40; /* ignore flush filter disable */
24723113 data &= ~(u64)0x100; /* ignore ignne emulation enable */
24733114 data &= ~(u64)0x8; /* ignore TLB cache disable */
2474
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
2475
- if (data != 0) {
3115
+
3116
+ /* Handle McStatusWrEn */
3117
+ if (data == BIT_ULL(18)) {
3118
+ vcpu->arch.msr_hwcr = data;
3119
+ } else if (data != 0) {
24763120 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
24773121 data);
24783122 return 1;
....@@ -2493,9 +3137,9 @@
24933137 /* Values other than LBR and BTF are vendor-specific,
24943138 thus reserved and should throw a #GP */
24953139 return 1;
2496
- }
2497
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2498
- __func__, data);
3140
+ } else if (report_ignored_msrs)
3141
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3142
+ __func__, data);
24993143 break;
25003144 case 0x200 ... 0x2ff:
25013145 return kvm_mtrr_set_msr(vcpu, msr, data);
....@@ -2520,15 +3164,46 @@
25203164 }
25213165 break;
25223166 case MSR_IA32_MISC_ENABLE:
2523
- vcpu->arch.ia32_misc_enable_msr = data;
3167
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3168
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3169
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3170
+ return 1;
3171
+ vcpu->arch.ia32_misc_enable_msr = data;
3172
+ kvm_update_cpuid_runtime(vcpu);
3173
+ } else {
3174
+ vcpu->arch.ia32_misc_enable_msr = data;
3175
+ }
25243176 break;
25253177 case MSR_IA32_SMBASE:
25263178 if (!msr_info->host_initiated)
25273179 return 1;
25283180 vcpu->arch.smbase = data;
25293181 break;
3182
+ case MSR_IA32_POWER_CTL:
3183
+ vcpu->arch.msr_ia32_power_ctl = data;
3184
+ break;
25303185 case MSR_IA32_TSC:
2531
- kvm_write_tsc(vcpu, msr_info);
3186
+ if (msr_info->host_initiated) {
3187
+ kvm_synchronize_tsc(vcpu, data);
3188
+ } else {
3189
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3190
+ adjust_tsc_offset_guest(vcpu, adj);
3191
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
3192
+ }
3193
+ break;
3194
+ case MSR_IA32_XSS:
3195
+ if (!msr_info->host_initiated &&
3196
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3197
+ return 1;
3198
+ /*
3199
+ * KVM supports exposing PT to the guest, but does not support
3200
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3201
+ * XSAVES/XRSTORS to save/restore PT MSRs.
3202
+ */
3203
+ if (data & ~supported_xss)
3204
+ return 1;
3205
+ vcpu->arch.ia32_xss = data;
3206
+ kvm_update_cpuid_runtime(vcpu);
25323207 break;
25333208 case MSR_SMI_COUNT:
25343209 if (!msr_info->host_initiated)
....@@ -2536,46 +3211,54 @@
25363211 vcpu->arch.smi_count = data;
25373212 break;
25383213 case MSR_KVM_WALL_CLOCK_NEW:
3214
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3215
+ return 1;
3216
+
3217
+ kvm_write_wall_clock(vcpu->kvm, data);
3218
+ break;
25393219 case MSR_KVM_WALL_CLOCK:
2540
- vcpu->kvm->arch.wall_clock = data;
3220
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3221
+ return 1;
3222
+
25413223 kvm_write_wall_clock(vcpu->kvm, data);
25423224 break;
25433225 case MSR_KVM_SYSTEM_TIME_NEW:
2544
- case MSR_KVM_SYSTEM_TIME: {
2545
- struct kvm_arch *ka = &vcpu->kvm->arch;
3226
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3227
+ return 1;
25463228
2547
- kvmclock_reset(vcpu);
2548
-
2549
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2550
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2551
-
2552
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2553
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2554
-
2555
- ka->boot_vcpu_runs_old_kvmclock = tmp;
2556
- }
2557
-
2558
- vcpu->arch.time = data;
2559
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2560
-
2561
- /* we verify if the enable bit is set... */
2562
- if (!(data & 1))
2563
- break;
2564
-
2565
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2566
- &vcpu->arch.pv_time, data & ~1ULL,
2567
- sizeof(struct pvclock_vcpu_time_info)))
2568
- vcpu->arch.pv_time_enabled = false;
2569
- else
2570
- vcpu->arch.pv_time_enabled = true;
2571
-
3229
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
25723230 break;
2573
- }
3231
+ case MSR_KVM_SYSTEM_TIME:
3232
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3233
+ return 1;
3234
+
3235
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3236
+ break;
25743237 case MSR_KVM_ASYNC_PF_EN:
3238
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3239
+ return 1;
3240
+
25753241 if (kvm_pv_enable_async_pf(vcpu, data))
25763242 return 1;
25773243 break;
3244
+ case MSR_KVM_ASYNC_PF_INT:
3245
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3246
+ return 1;
3247
+
3248
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
3249
+ return 1;
3250
+ break;
3251
+ case MSR_KVM_ASYNC_PF_ACK:
3252
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3253
+ return 1;
3254
+ if (data & 0x1) {
3255
+ vcpu->arch.apf.pageready_pending = false;
3256
+ kvm_check_async_pf_completion(vcpu);
3257
+ }
3258
+ break;
25783259 case MSR_KVM_STEAL_TIME:
3260
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3261
+ return 1;
25793262
25803263 if (unlikely(!sched_info_on()))
25813264 return 1;
....@@ -2592,8 +3275,22 @@
25923275
25933276 break;
25943277 case MSR_KVM_PV_EOI_EN:
3278
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3279
+ return 1;
3280
+
25953281 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
25963282 return 1;
3283
+ break;
3284
+
3285
+ case MSR_KVM_POLL_CONTROL:
3286
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3287
+ return 1;
3288
+
3289
+ /* only enable bit supported */
3290
+ if (data & (-1ULL << 1))
3291
+ return 1;
3292
+
3293
+ vcpu->arch.msr_kvm_poll_control = data;
25973294 break;
25983295
25993296 case MSR_IA32_MCG_CTL:
....@@ -2603,7 +3300,8 @@
26033300
26043301 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
26053302 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2606
- pr = true; /* fall through */
3303
+ pr = true;
3304
+ fallthrough;
26073305 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
26083306 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
26093307 if (kvm_pmu_is_valid_msr(vcpu, msr))
....@@ -2624,6 +3322,8 @@
26243322 */
26253323 break;
26263324 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3325
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3326
+ case HV_X64_MSR_SYNDBG_OPTIONS:
26273327 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
26283328 case HV_X64_MSR_CRASH_CTL:
26293329 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2669,33 +3369,11 @@
26693369 return xen_hvm_config(vcpu, data);
26703370 if (kvm_pmu_is_valid_msr(vcpu, msr))
26713371 return kvm_pmu_set_msr(vcpu, msr_info);
2672
- if (!ignore_msrs) {
2673
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2674
- msr, data);
2675
- return 1;
2676
- } else {
2677
- if (report_ignored_msrs)
2678
- vcpu_unimpl(vcpu,
2679
- "ignored wrmsr: 0x%x data 0x%llx\n",
2680
- msr, data);
2681
- break;
2682
- }
3372
+ return KVM_MSR_RET_INVALID;
26833373 }
26843374 return 0;
26853375 }
26863376 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2687
-
2688
-
2689
-/*
2690
- * Reads an msr value (of 'msr_index') into 'pdata'.
2691
- * Returns 0 on success, non-0 otherwise.
2692
- * Assumes vcpu_load() was already called.
2693
- */
2694
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2695
-{
2696
- return kvm_x86_ops->get_msr(vcpu, msr);
2697
-}
2698
-EXPORT_SYMBOL_GPL(kvm_get_msr);
26993377
27003378 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
27013379 {
....@@ -2748,7 +3426,6 @@
27483426 case MSR_K8_SYSCFG:
27493427 case MSR_K8_TSEG_ADDR:
27503428 case MSR_K8_TSEG_MASK:
2751
- case MSR_K7_HWCR:
27523429 case MSR_VM_HSAVE_PA:
27533430 case MSR_K8_INT_PENDING_MSG:
27543431 case MSR_AMD64_NB_CFG:
....@@ -2757,6 +3434,17 @@
27573434 case MSR_IA32_PERF_CTL:
27583435 case MSR_AMD64_DC_CFG:
27593436 case MSR_F15H_EX_CFG:
3437
+ /*
3438
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
3439
+ * limit) MSRs. Just return 0, as we do not want to expose the host
3440
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
3441
+ * so for existing CPU-specific MSRs.
3442
+ */
3443
+ case MSR_RAPL_POWER_UNIT:
3444
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
3445
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
3446
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
3447
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
27603448 msr_info->data = 0;
27613449 break;
27623450 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
....@@ -2765,7 +3453,7 @@
27653453 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
27663454 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
27673455 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2768
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
3456
+ return kvm_pmu_get_msr(vcpu, msr_info);
27693457 msr_info->data = 0;
27703458 break;
27713459 case MSR_IA32_UCODE_REV:
....@@ -2777,9 +3465,31 @@
27773465 return 1;
27783466 msr_info->data = vcpu->arch.arch_capabilities;
27793467 break;
2780
- case MSR_IA32_TSC:
2781
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
3468
+ case MSR_IA32_PERF_CAPABILITIES:
3469
+ if (!msr_info->host_initiated &&
3470
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3471
+ return 1;
3472
+ msr_info->data = vcpu->arch.perf_capabilities;
27823473 break;
3474
+ case MSR_IA32_POWER_CTL:
3475
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3476
+ break;
3477
+ case MSR_IA32_TSC: {
3478
+ /*
3479
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3480
+ * even when not intercepted. AMD manual doesn't explicitly
3481
+ * state this but appears to behave the same.
3482
+ *
3483
+ * On userspace reads and writes, however, we unconditionally
3484
+ * return L1's TSC value to ensure backwards-compatible
3485
+ * behavior for migration.
3486
+ */
3487
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3488
+ vcpu->arch.tsc_offset;
3489
+
3490
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3491
+ break;
3492
+ }
27833493 case MSR_MTRRcap:
27843494 case 0x200 ... 0x2ff:
27853495 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
....@@ -2805,7 +3515,6 @@
28053515 break;
28063516 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
28073517 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2808
- break;
28093518 case MSR_IA32_TSCDEADLINE:
28103519 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
28113520 break;
....@@ -2833,21 +3542,64 @@
28333542 msr_info->data = vcpu->arch.efer;
28343543 break;
28353544 case MSR_KVM_WALL_CLOCK:
3545
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3546
+ return 1;
3547
+
3548
+ msr_info->data = vcpu->kvm->arch.wall_clock;
3549
+ break;
28363550 case MSR_KVM_WALL_CLOCK_NEW:
3551
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3552
+ return 1;
3553
+
28373554 msr_info->data = vcpu->kvm->arch.wall_clock;
28383555 break;
28393556 case MSR_KVM_SYSTEM_TIME:
3557
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3558
+ return 1;
3559
+
3560
+ msr_info->data = vcpu->arch.time;
3561
+ break;
28403562 case MSR_KVM_SYSTEM_TIME_NEW:
3563
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3564
+ return 1;
3565
+
28413566 msr_info->data = vcpu->arch.time;
28423567 break;
28433568 case MSR_KVM_ASYNC_PF_EN:
2844
- msr_info->data = vcpu->arch.apf.msr_val;
3569
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3570
+ return 1;
3571
+
3572
+ msr_info->data = vcpu->arch.apf.msr_en_val;
3573
+ break;
3574
+ case MSR_KVM_ASYNC_PF_INT:
3575
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3576
+ return 1;
3577
+
3578
+ msr_info->data = vcpu->arch.apf.msr_int_val;
3579
+ break;
3580
+ case MSR_KVM_ASYNC_PF_ACK:
3581
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3582
+ return 1;
3583
+
3584
+ msr_info->data = 0;
28453585 break;
28463586 case MSR_KVM_STEAL_TIME:
3587
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3588
+ return 1;
3589
+
28473590 msr_info->data = vcpu->arch.st.msr_val;
28483591 break;
28493592 case MSR_KVM_PV_EOI_EN:
3593
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3594
+ return 1;
3595
+
28503596 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3597
+ break;
3598
+ case MSR_KVM_POLL_CONTROL:
3599
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3600
+ return 1;
3601
+
3602
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
28513603 break;
28523604 case MSR_IA32_P5_MC_ADDR:
28533605 case MSR_IA32_P5_MC_TYPE:
....@@ -2857,6 +3609,12 @@
28573609 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
28583610 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
28593611 msr_info->host_initiated);
3612
+ case MSR_IA32_XSS:
3613
+ if (!msr_info->host_initiated &&
3614
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3615
+ return 1;
3616
+ msr_info->data = vcpu->arch.ia32_xss;
3617
+ break;
28603618 case MSR_K7_CLK_CTL:
28613619 /*
28623620 * Provide expected ramp-up count for K7. All other
....@@ -2870,6 +3628,8 @@
28703628 msr_info->data = 0x20000000;
28713629 break;
28723630 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3631
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3632
+ case HV_X64_MSR_SYNDBG_OPTIONS:
28733633 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
28743634 case HV_X64_MSR_CRASH_CTL:
28753635 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2879,7 +3639,6 @@
28793639 return kvm_hv_get_msr_common(vcpu,
28803640 msr_info->index, &msr_info->data,
28813641 msr_info->host_initiated);
2882
- break;
28833642 case MSR_IA32_BBL_CR_CTL3:
28843643 /* This legacy MSR exists but isn't fully documented in current
28853644 * silicon. It is however accessed by winxp in very narrow
....@@ -2912,20 +3671,13 @@
29123671 case MSR_MISC_FEATURES_ENABLES:
29133672 msr_info->data = vcpu->arch.msr_misc_features_enables;
29143673 break;
3674
+ case MSR_K7_HWCR:
3675
+ msr_info->data = vcpu->arch.msr_hwcr;
3676
+ break;
29153677 default:
29163678 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2917
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2918
- if (!ignore_msrs) {
2919
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2920
- msr_info->index);
2921
- return 1;
2922
- } else {
2923
- if (report_ignored_msrs)
2924
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2925
- msr_info->index);
2926
- msr_info->data = 0;
2927
- }
2928
- break;
3679
+ return kvm_pmu_get_msr(vcpu, msr_info);
3680
+ return KVM_MSR_RET_INVALID;
29293681 }
29303682 return 0;
29313683 }
....@@ -2966,7 +3718,7 @@
29663718 unsigned size;
29673719
29683720 r = -EFAULT;
2969
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
3721
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
29703722 goto out;
29713723
29723724 r = -E2BIG;
....@@ -3037,24 +3789,33 @@
30373789 case KVM_CAP_HYPERV_VP_INDEX:
30383790 case KVM_CAP_HYPERV_EVENTFD:
30393791 case KVM_CAP_HYPERV_TLBFLUSH:
3792
+ case KVM_CAP_HYPERV_SEND_IPI:
3793
+ case KVM_CAP_HYPERV_CPUID:
30403794 case KVM_CAP_PCI_SEGMENT:
30413795 case KVM_CAP_DEBUGREGS:
30423796 case KVM_CAP_X86_ROBUST_SINGLESTEP:
30433797 case KVM_CAP_XSAVE:
30443798 case KVM_CAP_ASYNC_PF:
3799
+ case KVM_CAP_ASYNC_PF_INT:
30453800 case KVM_CAP_GET_TSC_KHZ:
30463801 case KVM_CAP_KVMCLOCK_CTRL:
30473802 case KVM_CAP_READONLY_MEM:
30483803 case KVM_CAP_HYPERV_TIME:
30493804 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
30503805 case KVM_CAP_TSC_DEADLINE_TIMER:
3051
- case KVM_CAP_ENABLE_CAP_VM:
30523806 case KVM_CAP_DISABLE_QUIRKS:
30533807 case KVM_CAP_SET_BOOT_CPU_ID:
30543808 case KVM_CAP_SPLIT_IRQCHIP:
30553809 case KVM_CAP_IMMEDIATE_EXIT:
3810
+ case KVM_CAP_PMU_EVENT_FILTER:
30563811 case KVM_CAP_GET_MSR_FEATURES:
30573812 case KVM_CAP_MSR_PLATFORM_INFO:
3813
+ case KVM_CAP_EXCEPTION_PAYLOAD:
3814
+ case KVM_CAP_SET_GUEST_DEBUG:
3815
+ case KVM_CAP_LAST_CPU:
3816
+ case KVM_CAP_X86_USER_SPACE_MSR:
3817
+ case KVM_CAP_X86_MSR_FILTER:
3818
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
30583819 r = 1;
30593820 break;
30603821 case KVM_CAP_SYNC_REGS:
....@@ -3064,7 +3825,8 @@
30643825 r = KVM_CLOCK_TSC_STABLE;
30653826 break;
30663827 case KVM_CAP_X86_DISABLE_EXITS:
3067
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
3828
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3829
+ KVM_X86_DISABLE_EXITS_CSTATE;
30683830 if(kvm_can_mwait_in_guest())
30693831 r |= KVM_X86_DISABLE_EXITS_MWAIT;
30703832 break;
....@@ -3077,10 +3839,10 @@
30773839 * fringe case that is not enabled except via specific settings
30783840 * of the module parameters.
30793841 */
3080
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
3842
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
30813843 break;
30823844 case KVM_CAP_VAPIC:
3083
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
3845
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
30843846 break;
30853847 case KVM_CAP_NR_VCPUS:
30863848 r = KVM_SOFT_MAX_VCPUS;
....@@ -3090,9 +3852,6 @@
30903852 break;
30913853 case KVM_CAP_MAX_VCPU_ID:
30923854 r = KVM_MAX_VCPU_ID;
3093
- break;
3094
- case KVM_CAP_NR_MEMSLOTS:
3095
- r = KVM_USER_MEM_SLOTS;
30963855 break;
30973856 case KVM_CAP_PV_MMU: /* obsolete */
30983857 r = 0;
....@@ -3110,8 +3869,20 @@
31103869 r = KVM_X2APIC_API_VALID_FLAGS;
31113870 break;
31123871 case KVM_CAP_NESTED_STATE:
3113
- r = kvm_x86_ops->get_nested_state ?
3114
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
3872
+ r = kvm_x86_ops.nested_ops->get_state ?
3873
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3874
+ break;
3875
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3876
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3877
+ break;
3878
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3879
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3880
+ break;
3881
+ case KVM_CAP_SMALLER_MAXPHYADDR:
3882
+ r = (int) allow_smaller_maxphyaddr;
3883
+ break;
3884
+ case KVM_CAP_STEAL_TIME:
3885
+ r = sched_info_on();
31153886 break;
31163887 default:
31173888 break;
....@@ -3133,11 +3904,11 @@
31333904 unsigned n;
31343905
31353906 r = -EFAULT;
3136
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3907
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
31373908 goto out;
31383909 n = msr_list.nmsrs;
31393910 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3140
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3911
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
31413912 goto out;
31423913 r = -E2BIG;
31433914 if (n < msr_list.nmsrs)
....@@ -3159,7 +3930,7 @@
31593930 struct kvm_cpuid2 cpuid;
31603931
31613932 r = -EFAULT;
3162
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3933
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
31633934 goto out;
31643935
31653936 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
....@@ -3168,12 +3939,12 @@
31683939 goto out;
31693940
31703941 r = -EFAULT;
3171
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3942
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
31723943 goto out;
31733944 r = 0;
31743945 break;
31753946 }
3176
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3947
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
31773948 r = -EFAULT;
31783949 if (copy_to_user(argp, &kvm_mce_cap_supported,
31793950 sizeof(kvm_mce_cap_supported)))
....@@ -3205,9 +3976,9 @@
32053976 case KVM_GET_MSRS:
32063977 r = msr_io(NULL, argp, do_get_msr_feature, 1);
32073978 break;
3208
- }
32093979 default:
32103980 r = -EINVAL;
3981
+ break;
32113982 }
32123983 out:
32133984 return r;
....@@ -3227,14 +3998,17 @@
32273998 {
32283999 /* Address WBINVD may be executed by guest */
32294000 if (need_emulate_wbinvd(vcpu)) {
3230
- if (kvm_x86_ops->has_wbinvd_exit())
4001
+ if (kvm_x86_ops.has_wbinvd_exit())
32314002 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
32324003 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
32334004 smp_call_function_single(vcpu->cpu,
32344005 wbinvd_ipi, NULL, 1);
32354006 }
32364007
3237
- kvm_x86_ops->vcpu_load(vcpu, cpu);
4008
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
4009
+
4010
+ /* Save host pkru register if supported */
4011
+ vcpu->arch.host_pkru = read_pkru();
32384012
32394013 /* Apply any externally detected TSC adjustments (due to suspend) */
32404014 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
....@@ -3301,7 +4075,7 @@
33014075 int idx;
33024076
33034077 if (vcpu->preempted)
3304
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
4078
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
33054079
33064080 /*
33074081 * Disable page faults because we're in atomic context here.
....@@ -3320,7 +4094,7 @@
33204094 kvm_steal_time_set_preempted(vcpu);
33214095 srcu_read_unlock(&vcpu->kvm->srcu, idx);
33224096 pagefault_enable();
3323
- kvm_x86_ops->vcpu_put(vcpu);
4097
+ kvm_x86_ops.vcpu_put(vcpu);
33244098 vcpu->arch.last_host_tsc = rdtsc();
33254099 /*
33264100 * If userspace has set any breakpoints or watchpoints, dr6 is restored
....@@ -3334,7 +4108,7 @@
33344108 struct kvm_lapic_state *s)
33354109 {
33364110 if (vcpu->arch.apicv_active)
3337
- kvm_x86_ops->sync_pir_to_irr(vcpu);
4111
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
33384112
33394113 return kvm_apic_get_state(vcpu, s);
33404114 }
....@@ -3453,8 +4227,7 @@
34534227 for (bank = 0; bank < bank_num; bank++)
34544228 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
34554229
3456
- if (kvm_x86_ops->setup_mce)
3457
- kvm_x86_ops->setup_mce(vcpu);
4230
+ kvm_x86_ops.setup_mce(vcpu);
34584231 out:
34594232 return r;
34604233 }
....@@ -3516,28 +4289,56 @@
35164289 process_smi(vcpu);
35174290
35184291 /*
3519
- * FIXME: pass injected and pending separately. This is only
3520
- * needed for nested virtualization, whose state cannot be
3521
- * migrated yet. For now we can combine them.
4292
+ * In guest mode, payload delivery should be deferred,
4293
+ * so that the L1 hypervisor can intercept #PF before
4294
+ * CR2 is modified (or intercept #DB before DR6 is
4295
+ * modified under nVMX). Unless the per-VM capability,
4296
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4297
+ * an exception payload and handle it after a KVM_GET_VCPU_EVENTS. Since we
4298
+ * opportunistically defer the exception payload, deliver it if the
4299
+ * capability hasn't been requested before processing a
4300
+ * KVM_GET_VCPU_EVENTS.
35224301 */
3523
- events->exception.injected =
3524
- (vcpu->arch.exception.pending ||
3525
- vcpu->arch.exception.injected) &&
3526
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
4302
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
4303
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4304
+ kvm_deliver_exception_payload(vcpu);
4305
+
4306
+ /*
4307
+ * The API doesn't provide the instruction length for software
4308
+ * exceptions, so don't report them. As long as the guest RIP
4309
+ * isn't advanced, we should expect to encounter the exception
4310
+ * again.
4311
+ */
4312
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4313
+ events->exception.injected = 0;
4314
+ events->exception.pending = 0;
4315
+ } else {
4316
+ events->exception.injected = vcpu->arch.exception.injected;
4317
+ events->exception.pending = vcpu->arch.exception.pending;
4318
+ /*
4319
+ * For ABI compatibility, deliberately conflate
4320
+ * pending and injected exceptions when
4321
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4322
+ */
4323
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4324
+ events->exception.injected |=
4325
+ vcpu->arch.exception.pending;
4326
+ }
35274327 events->exception.nr = vcpu->arch.exception.nr;
35284328 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3529
- events->exception.pad = 0;
35304329 events->exception.error_code = vcpu->arch.exception.error_code;
4330
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
4331
+ events->exception_payload = vcpu->arch.exception.payload;
35314332
35324333 events->interrupt.injected =
35334334 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
35344335 events->interrupt.nr = vcpu->arch.interrupt.nr;
35354336 events->interrupt.soft = 0;
3536
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
4337
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
35374338
35384339 events->nmi.injected = vcpu->arch.nmi_injected;
35394340 events->nmi.pending = vcpu->arch.nmi_pending != 0;
3540
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
4341
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
35414342 events->nmi.pad = 0;
35424343
35434344 events->sipi_vector = 0; /* never valid when reporting to user space */
....@@ -3551,10 +4352,13 @@
35514352 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
35524353 | KVM_VCPUEVENT_VALID_SHADOW
35534354 | KVM_VCPUEVENT_VALID_SMM);
4355
+ if (vcpu->kvm->arch.exception_payload_enabled)
4356
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4357
+
35544358 memset(&events->reserved, 0, sizeof(events->reserved));
35554359 }
35564360
3557
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
4361
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
35584362
35594363 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
35604364 struct kvm_vcpu_events *events)
....@@ -3562,12 +4366,24 @@
35624366 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
35634367 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
35644368 | KVM_VCPUEVENT_VALID_SHADOW
3565
- | KVM_VCPUEVENT_VALID_SMM))
4369
+ | KVM_VCPUEVENT_VALID_SMM
4370
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
35664371 return -EINVAL;
35674372
3568
- if (events->exception.injected &&
3569
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3570
- is_guest_mode(vcpu)))
4373
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4374
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4375
+ return -EINVAL;
4376
+ if (events->exception.pending)
4377
+ events->exception.injected = 0;
4378
+ else
4379
+ events->exception_has_payload = 0;
4380
+ } else {
4381
+ events->exception.pending = 0;
4382
+ events->exception_has_payload = 0;
4383
+ }
4384
+
4385
+ if ((events->exception.injected || events->exception.pending) &&
4386
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
35714387 return -EINVAL;
35724388
35734389 /* INITs are latched while in SMM */
....@@ -3577,35 +4393,40 @@
35774393 return -EINVAL;
35784394
35794395 process_nmi(vcpu);
3580
- vcpu->arch.exception.injected = false;
3581
- vcpu->arch.exception.pending = events->exception.injected;
4396
+ vcpu->arch.exception.injected = events->exception.injected;
4397
+ vcpu->arch.exception.pending = events->exception.pending;
35824398 vcpu->arch.exception.nr = events->exception.nr;
35834399 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
35844400 vcpu->arch.exception.error_code = events->exception.error_code;
4401
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
4402
+ vcpu->arch.exception.payload = events->exception_payload;
35854403
35864404 vcpu->arch.interrupt.injected = events->interrupt.injected;
35874405 vcpu->arch.interrupt.nr = events->interrupt.nr;
35884406 vcpu->arch.interrupt.soft = events->interrupt.soft;
35894407 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3590
- kvm_x86_ops->set_interrupt_shadow(vcpu,
4408
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
35914409 events->interrupt.shadow);
35924410
35934411 vcpu->arch.nmi_injected = events->nmi.injected;
35944412 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
35954413 vcpu->arch.nmi_pending = events->nmi.pending;
3596
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
4414
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
35974415
35984416 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
35994417 lapic_in_kernel(vcpu))
36004418 vcpu->arch.apic->sipi_vector = events->sipi_vector;
36014419
36024420 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3603
- u32 hflags = vcpu->arch.hflags;
3604
- if (events->smi.smm)
3605
- hflags |= HF_SMM_MASK;
3606
- else
3607
- hflags &= ~HF_SMM_MASK;
3608
- kvm_set_hflags(vcpu, hflags);
4421
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4422
+ if (events->smi.smm)
4423
+ vcpu->arch.hflags |= HF_SMM_MASK;
4424
+ else
4425
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
4426
+
4427
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
4428
+ kvm_smm_changed(vcpu);
4429
+ }
36094430
36104431 vcpu->arch.smi_pending = events->smi.pending;
36114432
....@@ -3614,12 +4435,13 @@
36144435 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
36154436 else
36164437 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3617
- if (lapic_in_kernel(vcpu)) {
3618
- if (events->smi.latched_init)
3619
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3620
- else
3621
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3622
- }
4438
+ }
4439
+
4440
+ if (lapic_in_kernel(vcpu)) {
4441
+ if (events->smi.latched_init)
4442
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4443
+ else
4444
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
36234445 }
36244446 }
36254447
....@@ -3655,7 +4477,6 @@
36554477 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
36564478 kvm_update_dr0123(vcpu);
36574479 vcpu->arch.dr6 = dbgregs->dr6;
3658
- kvm_update_dr6(vcpu);
36594480 vcpu->arch.dr7 = dbgregs->dr7;
36604481 kvm_update_dr7(vcpu);
36614482
....@@ -3666,7 +4487,7 @@
36664487
36674488 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
36684489 {
3669
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4490
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
36704491 u64 xstate_bv = xsave->header.xfeatures;
36714492 u64 valid;
36724493
....@@ -3686,15 +4507,15 @@
36864507 */
36874508 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
36884509 while (valid) {
3689
- u64 feature = valid & -valid;
3690
- int index = fls64(feature) - 1;
3691
- void *src = get_xsave_addr(xsave, feature);
4510
+ u64 xfeature_mask = valid & -valid;
4511
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4512
+ void *src = get_xsave_addr(xsave, xfeature_nr);
36924513
36934514 if (src) {
36944515 u32 size, offset, ecx, edx;
3695
- cpuid_count(XSTATE_CPUID, index,
4516
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
36964517 &size, &offset, &ecx, &edx);
3697
- if (feature == XFEATURE_MASK_PKRU)
4518
+ if (xfeature_nr == XFEATURE_PKRU)
36984519 memcpy(dest + offset, &vcpu->arch.pkru,
36994520 sizeof(vcpu->arch.pkru));
37004521 else
....@@ -3702,13 +4523,13 @@
37024523
37034524 }
37044525
3705
- valid -= feature;
4526
+ valid -= xfeature_mask;
37064527 }
37074528 }
37084529
37094530 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
37104531 {
3711
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4532
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
37124533 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
37134534 u64 valid;
37144535
....@@ -3729,22 +4550,22 @@
37294550 */
37304551 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
37314552 while (valid) {
3732
- u64 feature = valid & -valid;
3733
- int index = fls64(feature) - 1;
3734
- void *dest = get_xsave_addr(xsave, feature);
4553
+ u64 xfeature_mask = valid & -valid;
4554
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4555
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
37354556
37364557 if (dest) {
37374558 u32 size, offset, ecx, edx;
3738
- cpuid_count(XSTATE_CPUID, index,
4559
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
37394560 &size, &offset, &ecx, &edx);
3740
- if (feature == XFEATURE_MASK_PKRU)
4561
+ if (xfeature_nr == XFEATURE_PKRU)
37414562 memcpy(&vcpu->arch.pkru, src + offset,
37424563 sizeof(vcpu->arch.pkru));
37434564 else
37444565 memcpy(dest, src + offset, size);
37454566 }
37464567
3747
- valid -= feature;
4568
+ valid -= xfeature_mask;
37484569 }
37494570 }
37504571
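
The walk in fill_xsave()/load_xsave() peels off one extended-state component per iteration: valid & -valid isolates the lowest set bit, and fls64() - 1 converts it to the xfeature number. A stand-alone sketch with a userspace fls64() substitute and assumed example feature numbers (2 = YMM, 6 = ZMM_Hi256, 9 = PKRU):

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's fls64(): 1-based index of the MSB. */
static int fls64_approx(uint64_t x)
{
    return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
    uint64_t valid = (1ULL << 9) | (1ULL << 6) | (1ULL << 2);

    while (valid) {
        uint64_t xfeature_mask = valid & -valid;            /* lowest set bit */
        int xfeature_nr = fls64_approx(xfeature_mask) - 1;  /* its bit number */

        printf("copy xfeature %d (mask %#llx)\n",
               xfeature_nr, (unsigned long long)xfeature_mask);
        valid -= xfeature_mask;                             /* clear and continue */
    }
    return 0;
}
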
....@@ -3756,7 +4577,7 @@
37564577 fill_xsave((u8 *) guest_xsave->region, vcpu);
37574578 } else {
37584579 memcpy(guest_xsave->region,
3759
- &vcpu->arch.guest_fpu.state.fxsave,
4580
+ &vcpu->arch.guest_fpu->state.fxsave,
37604581 sizeof(struct fxregs_state));
37614582 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
37624583 XFEATURE_MASK_FPSSE;
....@@ -3778,15 +4599,14 @@
37784599 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
37794600 * with old userspace.
37804601 */
3781
- if (xstate_bv & ~kvm_supported_xcr0() ||
3782
- mxcsr & ~mxcsr_feature_mask)
4602
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
37834603 return -EINVAL;
37844604 load_xsave(vcpu, (u8 *)guest_xsave->region);
37854605 } else {
37864606 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
37874607 mxcsr & ~mxcsr_feature_mask)
37884608 return -EINVAL;
3789
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
4609
+ memcpy(&vcpu->arch.guest_fpu->state.fxsave,
37904610 guest_xsave->region, sizeof(struct fxregs_state));
37914611 }
37924612 return 0;
@@ -3847,6 +4667,10 @@
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				     struct kvm_enable_cap *cap)
 {
+	int r;
+	uint16_t vmcs_version;
+	void __user *user_ptr;
+
 	if (cap->flags)
 		return -EINVAL;

@@ -3854,11 +4678,37 @@
 	case KVM_CAP_HYPERV_SYNIC2:
 		if (cap->args[0])
 			return -EINVAL;
+		fallthrough;
+
 	case KVM_CAP_HYPERV_SYNIC:
 		if (!irqchip_in_kernel(vcpu->kvm))
 			return -EINVAL;
 		return kvm_hv_activate_synic(vcpu, cap->cap ==
 					     KVM_CAP_HYPERV_SYNIC2);
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+		if (!kvm_x86_ops.nested_ops->enable_evmcs)
+			return -ENOTTY;
+		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+		if (!r) {
+			user_ptr = (void __user *)(uintptr_t)cap->args[0];
+			if (copy_to_user(user_ptr, &vmcs_version,
+					 sizeof(vmcs_version)))
+				r = -EFAULT;
+		}
+		return r;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		if (!kvm_x86_ops.enable_direct_tlbflush)
+			return -ENOTTY;
+
+		return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+
+	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+		vcpu->arch.pv_cpuid.enforce = cap->args[0];
+		if (vcpu->arch.pv_cpuid.enforce)
+			kvm_update_pv_runtime(vcpu);
+
+		return 0;
+
 	default:
 		return -EINVAL;
 	}
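The new KVM_CAP_HYPERV_ENLIGHTENED_VMCS handling reads a user pointer from cap->args[0] and writes the supported eVMCS version back through it. A hedged userspace sketch of enabling that capability on a vCPU fd follows; it assumes a host kernel that actually advertises the capability, and error handling is kept minimal.

/* Sketch: ask KVM to expose enlightened VMCS and report its version. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	int vcpu = vm >= 0 ? ioctl(vm, KVM_CREATE_VCPU, 0) : -1;
	uint16_t vmcs_version = 0;
	struct kvm_enable_cap cap;

	if (vcpu < 0)
		return 1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS;
	/* KVM writes the supported eVMCS version through this user pointer. */
	cap.args[0] = (uint64_t)(uintptr_t)&vmcs_version;

	if (ioctl(vcpu, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP");
	else
		printf("supported eVMCS version field: 0x%x\n", vmcs_version);
	return 0;
}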
....@@ -3885,7 +4735,8 @@
38854735 r = -EINVAL;
38864736 if (!lapic_in_kernel(vcpu))
38874737 goto out;
3888
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
4738
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4739
+ GFP_KERNEL_ACCOUNT);
38894740
38904741 r = -ENOMEM;
38914742 if (!u.lapic)
....@@ -3916,7 +4767,7 @@
39164767 struct kvm_interrupt irq;
39174768
39184769 r = -EFAULT;
3919
- if (copy_from_user(&irq, argp, sizeof irq))
4770
+ if (copy_from_user(&irq, argp, sizeof(irq)))
39204771 goto out;
39214772 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
39224773 break;
....@@ -3934,7 +4785,7 @@
39344785 struct kvm_cpuid cpuid;
39354786
39364787 r = -EFAULT;
3937
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4788
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39384789 goto out;
39394790 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
39404791 break;
....@@ -3944,7 +4795,7 @@
39444795 struct kvm_cpuid2 cpuid;
39454796
39464797 r = -EFAULT;
3947
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4798
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39484799 goto out;
39494800 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
39504801 cpuid_arg->entries);
....@@ -3955,14 +4806,14 @@
39554806 struct kvm_cpuid2 cpuid;
39564807
39574808 r = -EFAULT;
3958
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4809
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39594810 goto out;
39604811 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
39614812 cpuid_arg->entries);
39624813 if (r)
39634814 goto out;
39644815 r = -EFAULT;
3965
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
4816
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
39664817 goto out;
39674818 r = 0;
39684819 break;
....@@ -3983,13 +4834,13 @@
39834834 struct kvm_tpr_access_ctl tac;
39844835
39854836 r = -EFAULT;
3986
- if (copy_from_user(&tac, argp, sizeof tac))
4837
+ if (copy_from_user(&tac, argp, sizeof(tac)))
39874838 goto out;
39884839 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
39894840 if (r)
39904841 goto out;
39914842 r = -EFAULT;
3992
- if (copy_to_user(argp, &tac, sizeof tac))
4843
+ if (copy_to_user(argp, &tac, sizeof(tac)))
39934844 goto out;
39944845 r = 0;
39954846 break;
....@@ -4002,7 +4853,7 @@
40024853 if (!lapic_in_kernel(vcpu))
40034854 goto out;
40044855 r = -EFAULT;
4005
- if (copy_from_user(&va, argp, sizeof va))
4856
+ if (copy_from_user(&va, argp, sizeof(va)))
40064857 goto out;
40074858 idx = srcu_read_lock(&vcpu->kvm->srcu);
40084859 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
....@@ -4013,7 +4864,7 @@
40134864 u64 mcg_cap;
40144865
40154866 r = -EFAULT;
4016
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
4867
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
40174868 goto out;
40184869 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
40194870 break;
....@@ -4022,7 +4873,7 @@
40224873 struct kvm_x86_mce mce;
40234874
40244875 r = -EFAULT;
4025
- if (copy_from_user(&mce, argp, sizeof mce))
4876
+ if (copy_from_user(&mce, argp, sizeof(mce)))
40264877 goto out;
40274878 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
40284879 break;
....@@ -4072,7 +4923,7 @@
40724923 break;
40734924 }
40744925 case KVM_GET_XSAVE: {
4075
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
4926
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
40764927 r = -ENOMEM;
40774928 if (!u.xsave)
40784929 break;
....@@ -4096,7 +4947,7 @@
40964947 break;
40974948 }
40984949 case KVM_GET_XCRS: {
4099
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
4950
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
41004951 r = -ENOMEM;
41014952 if (!u.xcrs)
41024953 break;
....@@ -4126,7 +4977,8 @@
41264977 r = -EINVAL;
41274978 user_tsc_khz = (u32)arg;
41284979
4129
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
4980
+ if (kvm_has_tsc_control &&
4981
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
41304982 goto out;
41314983
41324984 if (user_tsc_khz == 0)
....@@ -4159,7 +5011,7 @@
41595011 u32 user_data_size;
41605012
41615013 r = -EINVAL;
4162
- if (!kvm_x86_ops->get_nested_state)
5014
+ if (!kvm_x86_ops.nested_ops->get_state)
41635015 break;
41645016
41655017 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
....@@ -4167,8 +5019,8 @@
41675019 if (get_user(user_data_size, &user_kvm_nested_state->size))
41685020 break;
41695021
4170
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
4171
- user_data_size);
5022
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5023
+ user_data_size);
41725024 if (r < 0)
41735025 break;
41745026
....@@ -4189,7 +5041,7 @@
41895041 int idx;
41905042
41915043 r = -EINVAL;
4192
- if (!kvm_x86_ops->set_nested_state)
5044
+ if (!kvm_x86_ops.nested_ops->set_state)
41935045 break;
41945046
41955047 r = -EFAULT;
@@ -4201,16 +5053,38 @@
 			break;

 		if (kvm_state.flags &
-		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+		      | KVM_STATE_NESTED_GIF_SET))
 			break;

 		/* nested_run_pending implies guest_mode. */
-		if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
 			break;

 		idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+	case KVM_GET_SUPPORTED_HV_CPUID: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
+						cpuid_arg->entries);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+			goto out;
+		r = 0;
 		break;
 	}
 	default:
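KVM_GET_SUPPORTED_HV_CPUID, added above as a vCPU ioctl, fills a caller-provided struct kvm_cpuid2 with the Hyper-V CPUID leaves KVM can emulate. A hedged userspace sketch follows; the 64-entry headroom is an assumption that should comfortably cover the leaves reported by this kernel.

/* Sketch: query the Hyper-V CPUID leaves KVM supports for a vCPU. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MAX_HV_CPUID_ENTRIES 64	/* assumed to be plenty for current hosts */

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	int vcpu = vm >= 0 ? ioctl(vm, KVM_CREATE_VCPU, 0) : -1;
	struct kvm_cpuid2 *cpuid;
	size_t size = sizeof(*cpuid) +
		      MAX_HV_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2);
	unsigned int i;

	if (vcpu < 0)
		return 1;

	cpuid = calloc(1, size);
	cpuid->nent = MAX_HV_CPUID_ENTRIES;

	if (ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
		perror("KVM_GET_SUPPORTED_HV_CPUID");
		return 1;
	}

	for (i = 0; i < cpuid->nent; i++)
		printf("leaf 0x%08x: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
		       cpuid->entries[i].function, cpuid->entries[i].eax,
		       cpuid->entries[i].ebx, cpuid->entries[i].ecx,
		       cpuid->entries[i].edx);
	free(cpuid);
	return 0;
}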
....@@ -4234,14 +5108,14 @@
42345108
42355109 if (addr > (unsigned int)(-3 * PAGE_SIZE))
42365110 return -EINVAL;
4237
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
5111
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
42385112 return ret;
42395113 }
42405114
42415115 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
42425116 u64 ident_addr)
42435117 {
4244
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
5118
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
42455119 }
42465120
42475121 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
....@@ -4382,9 +5256,6 @@
43825256 {
43835257 struct kvm_pit *pit = kvm->arch.vpit;
43845258
4385
- if (!pit)
4386
- return -ENXIO;
4387
-
43885259 /* pit->pit_state.lock was overloaded to prevent userspace from getting
43895260 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
43905261 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -4396,50 +5267,13 @@
 	return 0;
 }

-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- * 1. Take a snapshot of the bit and clear it if needed.
- * 2. Write protect the corresponding page.
- * 3. Copy the snapshot to the userspace.
- * 4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-	bool is_dirty = false;
-	int r;
-
-	mutex_lock(&kvm->slots_lock);
-
 	/*
 	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 	 */
-	if (kvm_x86_ops->flush_log_dirty)
-		kvm_x86_ops->flush_log_dirty(kvm);
-
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
-
-	/*
-	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
-	 */
-	lockdep_assert_held(&kvm->slots_lock);
-	if (is_dirty)
-		kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->slots_lock);
-	return r;
+	if (kvm_x86_ops.flush_log_dirty)
+		kvm_x86_ops.flush_log_dirty(kvm);
 }

 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
....@@ -4454,8 +5288,8 @@
44545288 return 0;
44555289 }
44565290
4457
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4458
- struct kvm_enable_cap *cap)
5291
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5292
+ struct kvm_enable_cap *cap)
44595293 {
44605294 int r;
44615295
....@@ -4513,10 +5347,25 @@
45135347 kvm->arch.hlt_in_guest = true;
45145348 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
45155349 kvm->arch.pause_in_guest = true;
5350
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5351
+ kvm->arch.cstate_in_guest = true;
45165352 r = 0;
45175353 break;
45185354 case KVM_CAP_MSR_PLATFORM_INFO:
45195355 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5356
+ r = 0;
5357
+ break;
5358
+ case KVM_CAP_EXCEPTION_PAYLOAD:
5359
+ kvm->arch.exception_payload_enabled = cap->args[0];
5360
+ r = 0;
5361
+ break;
5362
+ case KVM_CAP_X86_USER_SPACE_MSR:
5363
+ r = -EINVAL;
5364
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
5365
+ KVM_MSR_EXIT_REASON_UNKNOWN |
5366
+ KVM_MSR_EXIT_REASON_FILTER))
5367
+ break;
5368
+ kvm->arch.user_space_msr_mask = cap->args[0];
45205369 r = 0;
45215370 break;
45225371 default:
....@@ -4525,6 +5374,180 @@
45255374 }
45265375 return r;
45275376 }
5377
+
5378
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5379
+{
5380
+ struct kvm_x86_msr_filter *msr_filter;
5381
+
5382
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5383
+ if (!msr_filter)
5384
+ return NULL;
5385
+
5386
+ msr_filter->default_allow = default_allow;
5387
+ return msr_filter;
5388
+}
5389
+
5390
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5391
+{
5392
+ u32 i;
5393
+
5394
+ if (!msr_filter)
5395
+ return;
5396
+
5397
+ for (i = 0; i < msr_filter->count; i++)
5398
+ kfree(msr_filter->ranges[i].bitmap);
5399
+
5400
+ kfree(msr_filter);
5401
+}
5402
+
5403
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5404
+ struct kvm_msr_filter_range *user_range)
5405
+{
5406
+ struct msr_bitmap_range range;
5407
+ unsigned long *bitmap = NULL;
5408
+ size_t bitmap_size;
5409
+ int r;
5410
+
5411
+ if (!user_range->nmsrs)
5412
+ return 0;
5413
+
5414
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5415
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5416
+ return -EINVAL;
5417
+
5418
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
5419
+ if (IS_ERR(bitmap))
5420
+ return PTR_ERR(bitmap);
5421
+
5422
+ range = (struct msr_bitmap_range) {
5423
+ .flags = user_range->flags,
5424
+ .base = user_range->base,
5425
+ .nmsrs = user_range->nmsrs,
5426
+ .bitmap = bitmap,
5427
+ };
5428
+
5429
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5430
+ r = -EINVAL;
5431
+ goto err;
5432
+ }
5433
+
5434
+ if (!range.flags) {
5435
+ r = -EINVAL;
5436
+ goto err;
5437
+ }
5438
+
5439
+ /* Everything ok, add this range identifier. */
5440
+ msr_filter->ranges[msr_filter->count] = range;
5441
+ msr_filter->count++;
5442
+
5443
+ return 0;
5444
+err:
5445
+ kfree(bitmap);
5446
+ return r;
5447
+}
5448
+
5449
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
5450
+ struct kvm_msr_filter *filter)
5451
+{
5452
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
5453
+ bool default_allow;
5454
+ bool empty = true;
5455
+ int r = 0;
5456
+ u32 i;
5457
+
5458
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
5459
+ return -EINVAL;
5460
+
5461
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
5462
+ empty &= !filter->ranges[i].nmsrs;
5463
+
5464
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
5465
+ if (empty && !default_allow)
5466
+ return -EINVAL;
5467
+
5468
+ new_filter = kvm_alloc_msr_filter(default_allow);
5469
+ if (!new_filter)
5470
+ return -ENOMEM;
5471
+
5472
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
5473
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
5474
+ if (r) {
5475
+ kvm_free_msr_filter(new_filter);
5476
+ return r;
5477
+ }
5478
+ }
5479
+
5480
+ mutex_lock(&kvm->lock);
5481
+
5482
+ /* The per-VM filter is protected by kvm->lock... */
5483
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
5484
+
5485
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
5486
+ synchronize_srcu(&kvm->srcu);
5487
+
5488
+ kvm_free_msr_filter(old_filter);
5489
+
5490
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5491
+ mutex_unlock(&kvm->lock);
5492
+
5493
+ return 0;
5494
+}
5495
+
5496
+#ifdef CONFIG_KVM_COMPAT
5497
+/* for KVM_X86_SET_MSR_FILTER */
5498
+struct kvm_msr_filter_range_compat {
5499
+ __u32 flags;
5500
+ __u32 nmsrs;
5501
+ __u32 base;
5502
+ __u32 bitmap;
5503
+};
5504
+
5505
+struct kvm_msr_filter_compat {
5506
+ __u32 flags;
5507
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
5508
+};
5509
+
5510
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
5511
+
5512
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5513
+ unsigned long arg)
5514
+{
5515
+ void __user *argp = (void __user *)arg;
5516
+ struct kvm *kvm = filp->private_data;
5517
+ long r = -ENOTTY;
5518
+
5519
+ switch (ioctl) {
5520
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
5521
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5522
+ struct kvm_msr_filter_compat filter_compat;
5523
+ struct kvm_msr_filter filter;
5524
+ int i;
5525
+
5526
+ if (copy_from_user(&filter_compat, user_msr_filter,
5527
+ sizeof(filter_compat)))
5528
+ return -EFAULT;
5529
+
5530
+ filter.flags = filter_compat.flags;
5531
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5532
+ struct kvm_msr_filter_range_compat *cr;
5533
+
5534
+ cr = &filter_compat.ranges[i];
5535
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
5536
+ .flags = cr->flags,
5537
+ .nmsrs = cr->nmsrs,
5538
+ .base = cr->base,
5539
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
5540
+ };
5541
+ }
5542
+
5543
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5544
+ break;
5545
+ }
5546
+ }
5547
+
5548
+ return r;
5549
+}
5550
+#endif
45285551
45295552 long kvm_arch_vm_ioctl(struct file *filp,
45305553 unsigned int ioctl, unsigned long arg)
....@@ -4555,7 +5578,7 @@
45555578 if (kvm->created_vcpus)
45565579 goto set_identity_unlock;
45575580 r = -EFAULT;
4558
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
5581
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
45595582 goto set_identity_unlock;
45605583 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
45615584 set_identity_unlock:
....@@ -4639,7 +5662,7 @@
46395662 if (r)
46405663 goto get_irqchip_out;
46415664 r = -EFAULT;
4642
- if (copy_to_user(argp, chip, sizeof *chip))
5665
+ if (copy_to_user(argp, chip, sizeof(*chip)))
46435666 goto get_irqchip_out;
46445667 r = 0;
46455668 get_irqchip_out:
....@@ -4660,9 +5683,6 @@
46605683 if (!irqchip_kernel(kvm))
46615684 goto set_irqchip_out;
46625685 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4663
- if (r)
4664
- goto set_irqchip_out;
4665
- r = 0;
46665686 set_irqchip_out:
46675687 kfree(chip);
46685688 break;
....@@ -4685,7 +5705,7 @@
46855705 }
46865706 case KVM_SET_PIT: {
46875707 r = -EFAULT;
4688
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
5708
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
46895709 goto out;
46905710 mutex_lock(&kvm->lock);
46915711 r = -ENXIO;
....@@ -4726,6 +5746,9 @@
47265746 struct kvm_reinject_control control;
47275747 r = -EFAULT;
47285748 if (copy_from_user(&control, argp, sizeof(control)))
5749
+ goto out;
5750
+ r = -ENXIO;
5751
+ if (!kvm->arch.vpit)
47295752 goto out;
47305753 r = kvm_vm_ioctl_reinject(kvm, &control);
47315754 break;
....@@ -4790,19 +5813,10 @@
47905813 r = 0;
47915814 break;
47925815 }
4793
- case KVM_ENABLE_CAP: {
4794
- struct kvm_enable_cap cap;
4795
-
4796
- r = -EFAULT;
4797
- if (copy_from_user(&cap, argp, sizeof(cap)))
4798
- goto out;
4799
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4800
- break;
4801
- }
48025816 case KVM_MEMORY_ENCRYPT_OP: {
48035817 r = -ENOTTY;
4804
- if (kvm_x86_ops->mem_enc_op)
4805
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
5818
+ if (kvm_x86_ops.mem_enc_op)
5819
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
48065820 break;
48075821 }
48085822 case KVM_MEMORY_ENCRYPT_REG_REGION: {
....@@ -4813,8 +5827,8 @@
48135827 goto out;
48145828
48155829 r = -ENOTTY;
4816
- if (kvm_x86_ops->mem_enc_reg_region)
4817
- r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
5830
+ if (kvm_x86_ops.mem_enc_reg_region)
5831
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
48185832 break;
48195833 }
48205834 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
....@@ -4825,8 +5839,8 @@
48255839 goto out;
48265840
48275841 r = -ENOTTY;
4828
- if (kvm_x86_ops->mem_enc_unreg_region)
4829
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
5842
+ if (kvm_x86_ops.mem_enc_unreg_region)
5843
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
48305844 break;
48315845 }
48325846 case KVM_HYPERV_EVENTFD: {
@@ -4838,6 +5852,19 @@
 		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
 		break;
 	}
+	case KVM_SET_PMU_EVENT_FILTER:
+		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+		break;
+	case KVM_X86_SET_MSR_FILTER: {
+		struct kvm_msr_filter __user *user_msr_filter = argp;
+		struct kvm_msr_filter filter;
+
+		if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+			return -EFAULT;
+
+		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
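The KVM_X86_SET_MSR_FILTER VM ioctl added above copies a struct kvm_msr_filter from userspace and installs it via kvm_vm_ioctl_set_msr_filter(). A hedged userspace sketch follows: it default-allows all MSRs and filters guest writes to one hypothetical MSR index (0x123). Per the upstream API documentation a set bitmap bit allows the access and a clear bit filters it; filtered accesses are reported to userspace only if KVM_CAP_X86_USER_SPACE_MSR has been enabled, otherwise they raise #GP in the guest.

/* Sketch: filter guest writes to one (hypothetical) MSR, allow the rest. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	struct kvm_msr_filter filter;
	uint8_t bitmap[1] = { 0 };	/* bit 0 clear = filter MSR base + 0 */

	if (vm < 0)
		return 1;

	memset(&filter, 0, sizeof(filter));
	filter.flags = 0;			/* default allow */
	filter.ranges[0].flags = KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = 0x123;		/* hypothetical MSR index */
	filter.ranges[0].nmsrs = 1;
	filter.ranges[0].bitmap = bitmap;

	if (ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter) < 0) {
		perror("KVM_X86_SET_MSR_FILTER");
		return 1;
	}
	printf("guest writes to MSR 0x123 are now filtered\n");
	return 0;
}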
....@@ -4847,58 +5874,96 @@
48475874
48485875 static void kvm_init_msr_list(void)
48495876 {
5877
+ struct x86_pmu_capability x86_pmu;
48505878 u32 dummy[2];
4851
- unsigned i, j;
5879
+ unsigned i;
48525880
4853
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4854
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
5881
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5882
+ "Please update the fixed PMCs in msrs_to_saved_all[]");
5883
+
5884
+ perf_get_x86_pmu_capability(&x86_pmu);
5885
+
5886
+ num_msrs_to_save = 0;
5887
+ num_emulated_msrs = 0;
5888
+ num_msr_based_features = 0;
5889
+
5890
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5891
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
48555892 continue;
48565893
48575894 /*
48585895 * Even MSRs that are valid in the host may not be exposed
48595896 * to the guests in some cases.
48605897 */
4861
- switch (msrs_to_save[i]) {
5898
+ switch (msrs_to_save_all[i]) {
48625899 case MSR_IA32_BNDCFGS:
48635900 if (!kvm_mpx_supported())
48645901 continue;
48655902 break;
48665903 case MSR_TSC_AUX:
4867
- if (!kvm_x86_ops->rdtscp_supported())
5904
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5905
+ continue;
5906
+ break;
5907
+ case MSR_IA32_UMWAIT_CONTROL:
5908
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5909
+ continue;
5910
+ break;
5911
+ case MSR_IA32_RTIT_CTL:
5912
+ case MSR_IA32_RTIT_STATUS:
5913
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5914
+ continue;
5915
+ break;
5916
+ case MSR_IA32_RTIT_CR3_MATCH:
5917
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5918
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5919
+ continue;
5920
+ break;
5921
+ case MSR_IA32_RTIT_OUTPUT_BASE:
5922
+ case MSR_IA32_RTIT_OUTPUT_MASK:
5923
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5924
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5925
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5926
+ continue;
5927
+ break;
5928
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5929
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5930
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5931
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5932
+ continue;
5933
+ break;
5934
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
5935
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
5936
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5937
+ continue;
5938
+ break;
5939
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
5940
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
5941
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
48685942 continue;
48695943 break;
48705944 default:
48715945 break;
48725946 }
48735947
4874
- if (j < i)
4875
- msrs_to_save[j] = msrs_to_save[i];
4876
- j++;
5948
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
48775949 }
4878
- num_msrs_to_save = j;
48795950
4880
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4881
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
5951
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
5952
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
48825953 continue;
48835954
4884
- if (j < i)
4885
- emulated_msrs[j] = emulated_msrs[i];
4886
- j++;
5955
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
48875956 }
4888
- num_emulated_msrs = j;
48895957
4890
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
5958
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
48915959 struct kvm_msr_entry msr;
48925960
4893
- msr.index = msr_based_features[i];
5961
+ msr.index = msr_based_features_all[i];
48945962 if (kvm_get_msr_feature(&msr))
48955963 continue;
48965964
4897
- if (j < i)
4898
- msr_based_features[j] = msr_based_features[i];
4899
- j++;
5965
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
49005966 }
4901
- num_msr_based_features = j;
49025967 }
49035968
49045969 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
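The kvm_init_msr_list() rework above stops compacting the arrays in place with an i/j pair and instead copies supported entries out of read-only msrs_to_save_all / emulated_msrs_all / msr_based_features_all tables while counting them. A small standalone sketch of that source-to-destination filtering pattern follows; the names, MSR indices, and predicate are illustrative, not the kernel's.

/* Illustrative filter-copy pattern: keep only entries a probe accepts. */
#include <stddef.h>
#include <stdio.h>

static const unsigned int msrs_all_demo[] = { 0x10, 0x174, 0x175, 0x176, 0xc0000080 };
static unsigned int msrs_supported[sizeof(msrs_all_demo) / sizeof(msrs_all_demo[0])];
static size_t num_supported;

/* Stand-in for the rdmsr_safe()/capability probing done per entry. */
static int msr_is_supported(unsigned int msr)
{
	return msr != 0x176;	/* pretend one entry probes as unsupported */
}

int main(void)
{
	size_t i;

	num_supported = 0;
	for (i = 0; i < sizeof(msrs_all_demo) / sizeof(msrs_all_demo[0]); i++) {
		if (!msr_is_supported(msrs_all_demo[i]))
			continue;
		msrs_supported[num_supported++] = msrs_all_demo[i];
	}

	for (i = 0; i < num_supported; i++)
		printf("0x%x\n", msrs_supported[i]);
	return 0;
}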
....@@ -4947,13 +6012,13 @@
49476012 static void kvm_set_segment(struct kvm_vcpu *vcpu,
49486013 struct kvm_segment *var, int seg)
49496014 {
4950
- kvm_x86_ops->set_segment(vcpu, var, seg);
6015
+ kvm_x86_ops.set_segment(vcpu, var, seg);
49516016 }
49526017
49536018 void kvm_get_segment(struct kvm_vcpu *vcpu,
49546019 struct kvm_segment *var, int seg)
49556020 {
4956
- kvm_x86_ops->get_segment(vcpu, var, seg);
6021
+ kvm_x86_ops.get_segment(vcpu, var, seg);
49576022 }
49586023
49596024 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
....@@ -4965,7 +6030,7 @@
49656030
49666031 /* NPT walks are always user-walks */
49676032 access |= PFERR_USER_MASK;
4968
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
6033
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
49696034
49706035 return t_gpa;
49716036 }
....@@ -4973,14 +6038,14 @@
49736038 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
49746039 struct x86_exception *exception)
49756040 {
4976
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6041
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49776042 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49786043 }
49796044
49806045 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
49816046 struct x86_exception *exception)
49826047 {
4983
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6048
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49846049 access |= PFERR_FETCH_MASK;
49856050 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49866051 }
....@@ -4988,7 +6053,7 @@
49886053 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
49896054 struct x86_exception *exception)
49906055 {
4991
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6056
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49926057 access |= PFERR_WRITE_MASK;
49936058 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49946059 }
....@@ -5037,7 +6102,7 @@
50376102 struct x86_exception *exception)
50386103 {
50396104 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5040
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6105
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50416106 unsigned offset;
50426107 int ret;
50436108
....@@ -5062,7 +6127,7 @@
50626127 gva_t addr, void *val, unsigned int bytes,
50636128 struct x86_exception *exception)
50646129 {
5065
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6130
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50666131
50676132 /*
50686133 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
....@@ -5083,7 +6148,7 @@
50836148 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
50846149 u32 access = 0;
50856150
5086
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6151
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
50876152 access |= PFERR_USER_MASK;
50886153
50896154 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
....@@ -5136,7 +6201,7 @@
51366201 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
51376202 u32 access = PFERR_WRITE_MASK;
51386203
5139
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6204
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
51406205 access |= PFERR_USER_MASK;
51416206
51426207 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
....@@ -5149,13 +6214,6 @@
51496214 /* kvm_write_guest_virt_system can pull in tons of pages. */
51506215 vcpu->arch.l1tf_flush_l1d = true;
51516216
5152
- /*
5153
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5154
- * is returned, but our callers are not ready for that and they blindly
5155
- * call kvm_inject_page_fault. Ensure that they at least do not leak
5156
- * uninitialized kernel stack memory into cr2 and error code.
5157
- */
5158
- memset(exception, 0, sizeof(*exception));
51596217 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
51606218 PFERR_WRITE_MASK, exception);
51616219 }
....@@ -5163,25 +6221,23 @@
51636221
51646222 int handle_ud(struct kvm_vcpu *vcpu)
51656223 {
6224
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
51666225 int emul_type = EMULTYPE_TRAP_UD;
5167
- enum emulation_result er;
51686226 char sig[5]; /* ud2; .ascii "kvm" */
51696227 struct x86_exception e;
6228
+
6229
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6230
+ return 1;
51706231
51716232 if (force_emulation_prefix &&
51726233 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
51736234 sig, sizeof(sig), &e) == 0 &&
5174
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
6235
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
51756236 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
5176
- emul_type = 0;
6237
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
51776238 }
51786239
5179
- er = kvm_emulate_instruction(vcpu, emul_type);
5180
- if (er == EMULATE_USER_EXIT)
5181
- return 0;
5182
- if (er != EMULATE_DONE)
5183
- kvm_queue_exception(vcpu, UD_VECTOR);
5184
- return 1;
6240
+ return kvm_emulate_instruction(vcpu, emul_type);
51856241 }
51866242 EXPORT_SYMBOL_GPL(handle_ud);
51876243
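With force_emulation_prefix enabled, handle_ud() above compares the five bytes at the guest's linear RIP against __KVM_EMULATE_PREFIX and, on a match, advances RIP past the prefix and emulates what follows. A standalone sketch of that signature check follows; the byte expansion (the ud2 opcode followed by "kvm") is stated here as an assumption about what the macro produces.

/* Standalone check for the 5-byte forced-emulation prefix: ud2 + "kvm". */
#include <stdio.h>
#include <string.h>

/* Assumed expansion of __KVM_EMULATE_PREFIX: 0x0f 0x0b is the ud2 opcode. */
static const unsigned char kvm_emulate_prefix[5] = { 0x0f, 0x0b, 'k', 'v', 'm' };

/* Returns 1 if the instruction stream starts with the prefix. */
static int starts_with_emulate_prefix(const unsigned char *rip_bytes, size_t len)
{
	if (len < sizeof(kvm_emulate_prefix))
		return 0;
	return memcmp(rip_bytes, kvm_emulate_prefix,
		      sizeof(kvm_emulate_prefix)) == 0;
}

int main(void)
{
	/* Hypothetical guest bytes: prefix followed by a one-byte nop. */
	const unsigned char stream[] = { 0x0f, 0x0b, 'k', 'v', 'm', 0x90 };

	if (starts_with_emulate_prefix(stream, sizeof(stream)))
		printf("skip %zu prefix bytes, emulate what follows\n",
		       sizeof(kvm_emulate_prefix));
	return 0;
}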
....@@ -5204,7 +6260,7 @@
52046260 gpa_t *gpa, struct x86_exception *exception,
52056261 bool write)
52066262 {
5207
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6263
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
52086264 | (write ? PFERR_WRITE_MASK : 0);
52096265
52106266 /*
....@@ -5214,7 +6270,7 @@
52146270 */
52156271 if (vcpu_match_mmio_gva(vcpu, gva)
52166272 && !permission_fault(vcpu, vcpu->arch.walk_mmu,
5217
- vcpu->arch.access, 0, access)) {
6273
+ vcpu->arch.mmio_access, 0, access)) {
52186274 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
52196275 (gva & (PAGE_SIZE - 1));
52206276 trace_vcpu_match_mmio(gva, *gpa, write, false);
....@@ -5323,7 +6379,7 @@
53236379 int handled, ret;
53246380 bool write = ops->write;
53256381 struct kvm_mmio_fragment *frag;
5326
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6382
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
53276383
53286384 /*
53296385 * If the exit was due to a NPF we may already have a GPA.
....@@ -5332,10 +6388,9 @@
53326388 * operation using rep will only have the initial GPA from the NPF
53336389 * occurred.
53346390 */
5335
- if (vcpu->arch.gpa_available &&
5336
- emulator_can_use_gpa(ctxt) &&
5337
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5338
- gpa = vcpu->arch.gpa_val;
6391
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6392
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6393
+ gpa = ctxt->gpa_val;
53396394 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
53406395 } else {
53416396 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5456,9 +6511,10 @@
 				     unsigned int bytes,
 				     struct x86_exception *exception)
 {
+	struct kvm_host_map map;
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	u64 page_line_mask;
 	gpa_t gpa;
-	struct page *page;
 	char *kaddr;
 	bool exchanged;

@@ -5472,15 +6528,23 @@
 	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto emul_write;

-	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+	/*
+	 * Emulate the atomic as a straight write to avoid #AC if SLD is
+	 * enabled in the host and the access splits a cache line.
+	 */
+	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+		page_line_mask = ~(cache_line_size() - 1);
+	else
+		page_line_mask = PAGE_MASK;
+
+	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
 		goto emul_write;

-	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-	if (is_error_page(page))
+	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
 		goto emul_write;

-	kaddr = kmap_atomic(page);
-	kaddr += offset_in_page(gpa);
+	kaddr = map.hva + offset_in_page(gpa);
+
 	switch (bytes) {
 	case 1:
 		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
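The emulated cmpxchg above now falls back to a plain write whenever split-lock detection is on and the access would straddle a cache line: it masks the first and last byte address with ~(cache_line_size() - 1) and compares the results. A small standalone version of that boundary arithmetic follows; the 64-byte line size is an assumption made only for the demo.

/* Does an access of 'bytes' at 'gpa' cross a boundary of size 'align'? */
#include <stdint.h>
#include <stdio.h>

static int crosses_boundary(uint64_t gpa, unsigned int bytes, uint64_t align)
{
	uint64_t mask = ~(align - 1);	/* align must be a power of two */

	return ((gpa + bytes - 1) & mask) != (gpa & mask);
}

int main(void)
{
	const uint64_t cache_line = 64;	/* assumed line size for the demo */

	/* 8-byte access at offset 0x3c spans bytes 0x3c..0x43: crosses a line. */
	printf("0x3c/8: %d\n", crosses_boundary(0x3c, 8, cache_line));
	/* 8-byte access at offset 0x40 stays inside one line. */
	printf("0x40/8: %d\n", crosses_boundary(0x40, 8, cache_line));
	return 0;
}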
....@@ -5497,13 +6561,12 @@
54976561 default:
54986562 BUG();
54996563 }
5500
- kunmap_atomic(kaddr);
5501
- kvm_release_page_dirty(page);
6564
+
6565
+ kvm_vcpu_unmap(vcpu, &map, true);
55026566
55036567 if (!exchanged)
55046568 return X86EMUL_CMPXCHG_FAILED;
55056569
5506
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
55076570 kvm_page_track_write(vcpu, gpa, new, bytes);
55086571
55096572 return X86EMUL_CONTINUE;
....@@ -5557,11 +6620,9 @@
55576620 return 0;
55586621 }
55596622
5560
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5561
- int size, unsigned short port, void *val,
5562
- unsigned int count)
6623
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6624
+ unsigned short port, void *val, unsigned int count)
55636625 {
5564
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
55656626 int ret;
55666627
55676628 if (vcpu->arch.pio.count)
....@@ -5581,20 +6642,33 @@
55816642 return 0;
55826643 }
55836644
5584
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
5585
- int size, unsigned short port,
5586
- const void *val, unsigned int count)
6645
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6646
+ int size, unsigned short port, void *val,
6647
+ unsigned int count)
55876648 {
5588
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6649
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
55896650
6651
+}
6652
+
6653
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6654
+ unsigned short port, const void *val,
6655
+ unsigned int count)
6656
+{
55906657 memcpy(vcpu->arch.pio_data, val, size * count);
55916658 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
55926659 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
55936660 }
55946661
6662
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6663
+ int size, unsigned short port,
6664
+ const void *val, unsigned int count)
6665
+{
6666
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6667
+}
6668
+
55956669 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
55966670 {
5597
- return kvm_x86_ops->get_segment_base(vcpu, seg);
6671
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
55986672 }
55996673
56006674 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
....@@ -5607,7 +6681,7 @@
56076681 if (!need_emulate_wbinvd(vcpu))
56086682 return X86EMUL_CONTINUE;
56096683
5610
- if (kvm_x86_ops->has_wbinvd_exit()) {
6684
+ if (kvm_x86_ops.has_wbinvd_exit()) {
56116685 int cpu = get_cpu();
56126686
56136687 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
....@@ -5712,27 +6786,27 @@
57126786
57136787 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
57146788 {
5715
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
6789
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
57166790 }
57176791
57186792 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57196793 {
5720
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
6794
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
57216795 }
57226796
57236797 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57246798 {
5725
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
6799
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
57266800 }
57276801
57286802 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57296803 {
5730
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
6804
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
57316805 }
57326806
57336807 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57346808 {
5735
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
6809
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
57366810 }
57376811
57386812 static unsigned long emulator_get_cached_segment_base(
....@@ -5810,28 +6884,33 @@
58106884 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
58116885 u32 msr_index, u64 *pdata)
58126886 {
5813
- struct msr_data msr;
6887
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
58146888 int r;
58156889
5816
- msr.index = msr_index;
5817
- msr.host_initiated = false;
5818
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
5819
- if (r)
5820
- return r;
6890
+ r = kvm_get_msr(vcpu, msr_index, pdata);
58216891
5822
- *pdata = msr.data;
5823
- return 0;
6892
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6893
+ /* Bounce to user space */
6894
+ return X86EMUL_IO_NEEDED;
6895
+ }
6896
+
6897
+ return r;
58246898 }
58256899
58266900 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
58276901 u32 msr_index, u64 data)
58286902 {
5829
- struct msr_data msr;
6903
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6904
+ int r;
58306905
5831
- msr.data = data;
5832
- msr.index = msr_index;
5833
- msr.host_initiated = false;
5834
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
6906
+ r = kvm_set_msr(vcpu, msr_index, data);
6907
+
6908
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6909
+ /* Bounce to user space */
6910
+ return X86EMUL_IO_NEEDED;
6911
+ }
6912
+
6913
+ return r;
58356914 }
58366915
58376916 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
....@@ -5851,7 +6930,7 @@
58516930 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
58526931 u32 pmc)
58536932 {
5854
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
6933
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
58556934 }
58566935
58576936 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
....@@ -5869,13 +6948,35 @@
58696948 struct x86_instruction_info *info,
58706949 enum x86_intercept_stage stage)
58716950 {
5872
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
6951
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
6952
+ &ctxt->exception);
58736953 }
58746954
58756955 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5876
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
6956
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
6957
+ bool exact_only)
58776958 {
5878
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
6959
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
6960
+}
6961
+
6962
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
6963
+{
6964
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
6965
+}
6966
+
6967
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
6968
+{
6969
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
6970
+}
6971
+
6972
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
6973
+{
6974
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
6975
+}
6976
+
6977
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
6978
+{
6979
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
58796980 }
58806981
58816982 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
....@@ -5890,7 +6991,7 @@
58906991
58916992 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
58926993 {
5893
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
6994
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
58946995 }
58956996
58966997 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
....@@ -5900,12 +7001,26 @@
59007001
59017002 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
59027003 {
5903
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
7004
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7005
+
7006
+ vcpu->arch.hflags = emul_flags;
7007
+ kvm_mmu_reset_context(vcpu);
59047008 }
59057009
5906
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
7010
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
7011
+ const char *smstate)
59077012 {
5908
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
7013
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
7014
+}
7015
+
7016
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
7017
+{
7018
+ kvm_smm_changed(emul_to_vcpu(ctxt));
7019
+}
7020
+
7021
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
7022
+{
7023
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
59097024 }
59107025
59117026 static const struct x86_emulate_ops emulate_ops = {
....@@ -5944,15 +7059,21 @@
59447059 .fix_hypercall = emulator_fix_hypercall,
59457060 .intercept = emulator_intercept,
59467061 .get_cpuid = emulator_get_cpuid,
7062
+ .guest_has_long_mode = emulator_guest_has_long_mode,
7063
+ .guest_has_movbe = emulator_guest_has_movbe,
7064
+ .guest_has_fxsr = emulator_guest_has_fxsr,
7065
+ .guest_has_rdpid = emulator_guest_has_rdpid,
59477066 .set_nmi_mask = emulator_set_nmi_mask,
59487067 .get_hflags = emulator_get_hflags,
59497068 .set_hflags = emulator_set_hflags,
59507069 .pre_leave_smm = emulator_pre_leave_smm,
7070
+ .post_leave_smm = emulator_post_leave_smm,
7071
+ .set_xcr = emulator_set_xcr,
59517072 };
59527073
59537074 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
59547075 {
5955
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
7076
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
59567077 /*
59577078 * an sti; sti; sequence only disable interrupts for the first
59587079 * instruction. So, if the last instruction, be it emulated or
....@@ -5963,7 +7084,7 @@
59637084 if (int_shadow & mask)
59647085 mask = 0;
59657086 if (unlikely(int_shadow || mask)) {
5966
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
7087
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
59677088 if (!mask)
59687089 kvm_make_request(KVM_REQ_EVENT, vcpu);
59697090 }
....@@ -5971,9 +7092,9 @@
59717092
59727093 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
59737094 {
5974
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7095
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59757096 if (ctxt->exception.vector == PF_VECTOR)
5976
- return kvm_propagate_fault(vcpu, &ctxt->exception);
7097
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
59777098
59787099 if (ctxt->exception.error_code_valid)
59797100 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
....@@ -5983,13 +7104,31 @@
59837104 return false;
59847105 }
59857106
7107
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
7108
+{
7109
+ struct x86_emulate_ctxt *ctxt;
7110
+
7111
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
7112
+ if (!ctxt) {
7113
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
7114
+ return NULL;
7115
+ }
7116
+
7117
+ ctxt->vcpu = vcpu;
7118
+ ctxt->ops = &emulate_ops;
7119
+ vcpu->arch.emulate_ctxt = ctxt;
7120
+
7121
+ return ctxt;
7122
+}
7123
+
59867124 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
59877125 {
5988
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7126
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59897127 int cs_db, cs_l;
59907128
5991
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
7129
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
59927130
7131
+ ctxt->gpa_available = false;
59937132 ctxt->eflags = kvm_get_rflags(vcpu);
59947133 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
59957134
....@@ -6003,13 +7142,18 @@
60037142 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
60047143 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
60057144
7145
+ ctxt->interruptibility = 0;
7146
+ ctxt->have_exception = false;
7147
+ ctxt->exception.vector = -1;
7148
+ ctxt->perm_ok = false;
7149
+
60067150 init_decode_cache(ctxt);
60077151 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
60087152 }
60097153
6010
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
7154
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
60117155 {
6012
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7156
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
60137157 int ret;
60147158
60157159 init_emulate_ctxt(vcpu);
....@@ -6019,37 +7163,43 @@
60197163 ctxt->_eip = ctxt->eip + inc_eip;
60207164 ret = emulate_int_real(ctxt, irq);
60217165
6022
- if (ret != X86EMUL_CONTINUE)
6023
- return EMULATE_FAIL;
6024
-
6025
- ctxt->eip = ctxt->_eip;
6026
- kvm_rip_write(vcpu, ctxt->eip);
6027
- kvm_set_rflags(vcpu, ctxt->eflags);
6028
-
6029
- return EMULATE_DONE;
7166
+ if (ret != X86EMUL_CONTINUE) {
7167
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7168
+ } else {
7169
+ ctxt->eip = ctxt->_eip;
7170
+ kvm_rip_write(vcpu, ctxt->eip);
7171
+ kvm_set_rflags(vcpu, ctxt->eflags);
7172
+ }
60307173 }
60317174 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
60327175
60337176 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
60347177 {
6035
- int r = EMULATE_DONE;
6036
-
60377178 ++vcpu->stat.insn_emulation_fail;
60387179 trace_kvm_emulate_insn_failed(vcpu);
60397180
6040
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
6041
- return EMULATE_FAIL;
7181
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
7182
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7183
+ return 1;
7184
+ }
60427185
6043
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
7186
+ if (emulation_type & EMULTYPE_SKIP) {
60447187 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
60457188 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
60467189 vcpu->run->internal.ndata = 0;
6047
- r = EMULATE_USER_EXIT;
7190
+ return 0;
60487191 }
60497192
60507193 kvm_queue_exception(vcpu, UD_VECTOR);
60517194
6052
- return r;
7195
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7196
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7197
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7198
+ vcpu->run->internal.ndata = 0;
7199
+ return 0;
7200
+ }
7201
+
7202
+ return 1;
60537203 }
60547204
60557205 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
....@@ -6059,13 +7209,14 @@
60597209 gpa_t gpa = cr2_or_gpa;
60607210 kvm_pfn_t pfn;
60617211
6062
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7212
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
60637213 return false;
60647214
6065
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7215
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7216
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
60667217 return false;
60677218
6068
- if (!vcpu->arch.mmu.direct_map) {
7219
+ if (!vcpu->arch.mmu->direct_map) {
60697220 /*
60707221 * Write permission should be allowed since only
60717222 * write access need to be emulated.
....@@ -6098,7 +7249,7 @@
60987249 kvm_release_pfn_clean(pfn);
60997250
61007251 /* The instructions are well-emulated on direct mmu. */
6101
- if (vcpu->arch.mmu.direct_map) {
7252
+ if (vcpu->arch.mmu->direct_map) {
61027253 unsigned int indirect_shadow_pages;
61037254
61047255 spin_lock(&vcpu->kvm->mmu_lock);
....@@ -6150,10 +7301,11 @@
61507301 */
61517302 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
61527303
6153
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7304
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
61547305 return false;
61557306
6156
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7307
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7308
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
61577309 return false;
61587310
61597311 if (x86_page_table_writing_insn(ctxt))
....@@ -6165,7 +7317,7 @@
61657317 vcpu->arch.last_retry_eip = ctxt->eip;
61667318 vcpu->arch.last_retry_addr = cr2_or_gpa;
61677319
6168
- if (!vcpu->arch.mmu.direct_map)
7320
+ if (!vcpu->arch.mmu->direct_map)
61697321 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
61707322
61717323 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
....@@ -6189,16 +7341,6 @@
61897341 kvm_mmu_reset_context(vcpu);
61907342 }
61917343
6192
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
6193
-{
6194
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
6195
-
6196
- vcpu->arch.hflags = emul_flags;
6197
-
6198
- if (changed & HF_SMM_MASK)
6199
- kvm_smm_changed(vcpu);
6200
-}
6201
-
62027344 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
62037345 unsigned long *db)
62047346 {
....@@ -6214,34 +7356,29 @@
62147356 return dr6;
62157357 }
62167358
6217
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
7359
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
62187360 {
62197361 struct kvm_run *kvm_run = vcpu->run;
62207362
62217363 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
62227364 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
6223
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
7365
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
62247366 kvm_run->debug.arch.exception = DB_VECTOR;
62257367 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6226
- *r = EMULATE_USER_EXIT;
6227
- } else {
6228
- /*
6229
- * "Certain debug exceptions may clear bit 0-3. The
6230
- * remaining contents of the DR6 register are never
6231
- * cleared by the processor".
6232
- */
6233
- vcpu->arch.dr6 &= ~15;
6234
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
6235
- kvm_queue_exception(vcpu, DB_VECTOR);
7368
+ return 0;
62367369 }
7370
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7371
+ return 1;
62377372 }
62387373
62397374 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
62407375 {
6241
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
6242
- int r = EMULATE_DONE;
7376
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7377
+ int r;
62437378
6244
- kvm_x86_ops->skip_emulated_instruction(vcpu);
7379
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7380
+ if (unlikely(!r))
7381
+ return 0;
62457382
62467383 /*
62477384 * rflags is the old, "raw" value of the flags. The new value has
....@@ -6252,12 +7389,12 @@
62527389 * that sets the TF flag".
62537390 */
62547391 if (unlikely(rflags & X86_EFLAGS_TF))
6255
- kvm_vcpu_do_singlestep(vcpu, &r);
6256
- return r == EMULATE_DONE;
7392
+ r = kvm_vcpu_do_singlestep(vcpu);
7393
+ return r;
62577394 }
62587395 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
62597396
6260
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
7397
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
62617398 {
62627399 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
62637400 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
....@@ -6272,7 +7409,7 @@
62727409 kvm_run->debug.arch.pc = eip;
62737410 kvm_run->debug.arch.exception = DB_VECTOR;
62747411 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6275
- *r = EMULATE_USER_EXIT;
7412
+ *r = 0;
62767413 return true;
62777414 }
62787415 }
....@@ -6285,10 +7422,8 @@
62857422 vcpu->arch.db);
62867423
62877424 if (dr6 != 0) {
6288
- vcpu->arch.dr6 &= ~15;
6289
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
6290
- kvm_queue_exception(vcpu, DB_VECTOR);
6291
- *r = EMULATE_DONE;
7425
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7426
+ *r = 1;
62927427 return true;
62937428 }
62947429 }
....@@ -6327,13 +7462,45 @@
63277462 return false;
63287463 }
63297464
7465
+/*
7466
+ * Decode an instruction for emulation. The caller is responsible for handling
7467
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
7468
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
7469
+ * code breakpoints have higher priority and thus have already been done by
7470
+ * hardware.
7471
+ *
7472
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
7473
+ * response to a machine check.
7474
+ */
7475
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
7476
+ void *insn, int insn_len)
7477
+{
7478
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7479
+ int r;
7480
+
7481
+ init_emulate_ctxt(vcpu);
7482
+
7483
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7484
+
7485
+ r = x86_decode_insn(ctxt, insn, insn_len);
7486
+
7487
+ trace_kvm_emulate_insn_start(vcpu);
7488
+ ++vcpu->stat.insn_emulation;
7489
+
7490
+ return r;
7491
+}
7492
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
7493
+
63307494 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
63317495 int emulation_type, void *insn, int insn_len)
63327496 {
63337497 int r;
6334
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7498
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63357499 bool writeback = true;
6336
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7500
+ bool write_fault_to_spt;
7501
+
7502
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7503
+ return 1;
63377504
63387505 vcpu->arch.l1tf_flush_l1d = true;
63397506
....@@ -6341,39 +7508,33 @@
63417508 * Clear write_fault_to_shadow_pgtable here to ensure it is
63427509 * never reused.
63437510 */
7511
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
63447512 vcpu->arch.write_fault_to_shadow_pgtable = false;
6345
- kvm_clear_exception_queue(vcpu);
63467513
63477514 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
6348
- init_emulate_ctxt(vcpu);
7515
+ kvm_clear_exception_queue(vcpu);
63497516
63507517 /*
6351
- * We will reenter on the same instruction since
6352
- * we do not set complete_userspace_io. This does not
6353
- * handle watchpoints yet, those would be handled in
6354
- * the emulate_ops.
7518
+ * Return immediately if RIP hits a code breakpoint, such #DBs
7519
+ * are fault-like and are higher priority than any faults on
7520
+ * the code fetch itself.
63557521 */
63567522 if (!(emulation_type & EMULTYPE_SKIP) &&
6357
- kvm_vcpu_check_breakpoint(vcpu, &r))
7523
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
63587524 return r;
63597525
6360
- ctxt->interruptibility = 0;
6361
- ctxt->have_exception = false;
6362
- ctxt->exception.vector = -1;
6363
- ctxt->perm_ok = false;
6364
-
6365
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
6366
-
6367
- r = x86_decode_insn(ctxt, insn, insn_len);
6368
-
6369
- trace_kvm_emulate_insn_start(vcpu);
6370
- ++vcpu->stat.insn_emulation;
7526
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
7527
+ insn, insn_len);
63717528 if (r != EMULATION_OK) {
6372
- if (emulation_type & EMULTYPE_TRAP_UD)
6373
- return EMULATE_FAIL;
6374
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
6375
- emulation_type))
6376
- return EMULATE_DONE;
7529
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
7530
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7531
+ kvm_queue_exception(vcpu, UD_VECTOR);
7532
+ return 1;
7533
+ }
7534
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
7535
+ write_fault_to_spt,
7536
+ emulation_type))
7537
+ return 1;
63777538 if (ctxt->have_exception) {
63787539 /*
63797540 * #UD should result in just EMULATION_FAILED, and trap-like
....@@ -6382,27 +7543,32 @@
63827543 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
63837544 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
63847545 inject_emulated_exception(vcpu);
6385
- return EMULATE_DONE;
7546
+ return 1;
63867547 }
6387
- if (emulation_type & EMULTYPE_SKIP)
6388
- return EMULATE_FAIL;
63897548 return handle_emulation_failure(vcpu, emulation_type);
63907549 }
63917550 }
63927551
6393
- if ((emulation_type & EMULTYPE_VMWARE) &&
6394
- !is_vmware_backdoor_opcode(ctxt))
6395
- return EMULATE_FAIL;
7552
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7553
+ !is_vmware_backdoor_opcode(ctxt)) {
7554
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7555
+ return 1;
7556
+ }
63967557
7558
+ /*
7559
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7560
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
7561
+ * updating interruptibility state and injecting single-step #DBs.
7562
+ */
63977563 if (emulation_type & EMULTYPE_SKIP) {
63987564 kvm_rip_write(vcpu, ctxt->_eip);
63997565 if (ctxt->eflags & X86_EFLAGS_RF)
64007566 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
6401
- return EMULATE_DONE;
7567
+ return 1;
64027568 }
64037569
64047570 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
6405
- return EMULATE_DONE;
7571
+ return 1;
64067572
64077573 /* this is needed for vmware backdoor interface to work since it
64087574 changes registers values during IO operation */
....@@ -6412,24 +7578,35 @@
64127578 }
64137579
64147580 restart:
6415
- /* Save the faulting GPA (cr2) in the address field */
6416
- ctxt->exception.address = cr2_or_gpa;
7581
+ if (emulation_type & EMULTYPE_PF) {
7582
+ /* Save the faulting GPA (cr2) in the address field */
7583
+ ctxt->exception.address = cr2_or_gpa;
7584
+
7585
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
7586
+ if (vcpu->arch.mmu->direct_map) {
7587
+ ctxt->gpa_available = true;
7588
+ ctxt->gpa_val = cr2_or_gpa;
7589
+ }
7590
+ } else {
7591
+ /* Sanitize the address out of an abundance of paranoia. */
7592
+ ctxt->exception.address = 0;
7593
+ }
64177594
64187595 r = x86_emulate_insn(ctxt);
64197596
64207597 if (r == EMULATION_INTERCEPTED)
6421
- return EMULATE_DONE;
7598
+ return 1;
64227599
64237600 if (r == EMULATION_FAILED) {
64247601 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
64257602 emulation_type))
6426
- return EMULATE_DONE;
7603
+ return 1;
64277604
64287605 return handle_emulation_failure(vcpu, emulation_type);
64297606 }
64307607
64317608 if (ctxt->have_exception) {
6432
- r = EMULATE_DONE;
7609
+ r = 1;
64337610 if (inject_emulated_exception(vcpu))
64347611 return r;
64357612 } else if (vcpu->arch.pio.count) {
....@@ -6440,26 +7617,36 @@
64407617 writeback = false;
64417618 vcpu->arch.complete_userspace_io = complete_emulated_pio;
64427619 }
6443
- r = EMULATE_USER_EXIT;
7620
+ r = 0;
64447621 } else if (vcpu->mmio_needed) {
7622
+ ++vcpu->stat.mmio_exits;
7623
+
64457624 if (!vcpu->mmio_is_write)
64467625 writeback = false;
6447
- r = EMULATE_USER_EXIT;
7626
+ r = 0;
64487627 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
64497628 } else if (r == EMULATION_RESTART)
64507629 goto restart;
64517630 else
6452
- r = EMULATE_DONE;
7631
+ r = 1;
64537632
64547633 if (writeback) {
6455
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
7634
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
64567635 toggle_interruptibility(vcpu, ctxt->interruptibility);
64577636 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7637
+
7638
+ /*
7639
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
7640
+ * only supports code breakpoints and general detect #DB, both
7641
+ * of which are fault-like.
7642
+ */
64587643 if (!ctxt->have_exception ||
64597644 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
64607645 kvm_rip_write(vcpu, ctxt->eip);
6461
- if (r == EMULATE_DONE && ctxt->tf)
6462
- kvm_vcpu_do_singlestep(vcpu, &r);
7646
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7647
+ r = kvm_vcpu_do_singlestep(vcpu);
7648
+ if (kvm_x86_ops.update_emulated_instruction)
7649
+ kvm_x86_ops.update_emulated_instruction(vcpu);
64637650 __kvm_set_rflags(vcpu, ctxt->eflags);
64647651 }
64657652
....@@ -6509,9 +7696,9 @@
65097696 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
65107697 unsigned short port)
65117698 {
6512
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
6513
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
6514
- size, port, &val, 1);
7699
+ unsigned long val = kvm_rax_read(vcpu);
7700
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7701
+
65157702 if (ret)
65167703 return ret;
65177704
....@@ -6544,16 +7731,14 @@
65447731 }
65457732
65467733 /* For size less than 4 we merge, else we zero extend */
6547
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
6548
- : 0;
7734
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
65497735
65507736 /*
6551
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
7737
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
65527738 * the copy and tracing
65537739 */
6554
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
6555
- vcpu->arch.pio.port, &val, 1);
6556
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7740
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7741
+ kvm_rax_write(vcpu, val);
65577742
65587743 return kvm_skip_emulated_instruction(vcpu);
65597744 }
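The "merge" rule above means a 1- or 2-byte port read replaces only the low bytes of RAX, while a 4-byte read zero-extends into the full register. A small hypothetical helper (not from this file) expressing just that arithmetic:

static unsigned long fast_pio_in_merge(unsigned long old_rax,
				       unsigned long in_val, int size)
{
	/* e.g. old_rax = 0xdeadbeef, a 2-byte IN returning 0x1234 -> 0xdead1234;
	 * a 4-byte IN returning 0x1234 -> 0x0000000000001234. */
	if (size < 4) {
		unsigned long mask = (1UL << (size * 8)) - 1;

		return (old_rax & ~mask) | (in_val & mask);
	}
	return in_val;
}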
....@@ -6565,12 +7750,11 @@
65657750 int ret;
65667751
65677752 /* For size less than 4 we merge, else we zero extend */
6568
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
7753
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
65697754
6570
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
6571
- &val, 1);
7755
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
65727756 if (ret) {
6573
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7757
+ kvm_rax_write(vcpu, val);
65747758 return ret;
65757759 }
65767760
....@@ -6649,10 +7833,8 @@
66497833 }
66507834 #endif
66517835
6652
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
6653
- void *data)
7836
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
66547837 {
6655
- struct cpufreq_freqs *freq = data;
66567838 struct kvm *kvm;
66577839 struct kvm_vcpu *vcpu;
66587840 int i, send_ipi = 0;
....@@ -6696,17 +7878,12 @@
66967878 *
66977879 */
66987880
6699
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
6700
- return 0;
6701
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
6702
- return 0;
6703
-
6704
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7881
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67057882
67067883 mutex_lock(&kvm_lock);
67077884 list_for_each_entry(kvm, &vm_list, vm_list) {
67087885 kvm_for_each_vcpu(i, vcpu, kvm) {
6709
- if (vcpu->cpu != freq->cpu)
7886
+ if (vcpu->cpu != cpu)
67107887 continue;
67117888 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
67127889 if (vcpu->cpu != raw_smp_processor_id())
....@@ -6728,8 +7905,24 @@
67287905 * guest context is entered kvmclock will be updated,
67297906 * so the guest will not see stale values.
67307907 */
6731
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7908
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67327909 }
7910
+}
7911
+
7912
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7913
+ void *data)
7914
+{
7915
+ struct cpufreq_freqs *freq = data;
7916
+ int cpu;
7917
+
7918
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7919
+ return 0;
7920
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7921
+ return 0;
7922
+
7923
+ for_each_cpu(cpu, freq->policy->cpus)
7924
+ __kvmclock_cpufreq_notifier(freq, cpu);
7925
+
67337926 return 0;
67347927 }
67357928
....@@ -6749,20 +7942,21 @@
67497942
67507943 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
67517944 #ifdef CONFIG_CPU_FREQ
6752
- struct cpufreq_policy policy;
7945
+ struct cpufreq_policy *policy;
67537946 int cpu;
67547947
6755
- memset(&policy, 0, sizeof(policy));
67567948 cpu = get_cpu();
6757
- cpufreq_get_policy(&policy, cpu);
6758
- if (policy.cpuinfo.max_freq)
6759
- max_tsc_khz = policy.cpuinfo.max_freq;
7949
+ policy = cpufreq_cpu_get(cpu);
7950
+ if (policy) {
7951
+ if (policy->cpuinfo.max_freq)
7952
+ max_tsc_khz = policy->cpuinfo.max_freq;
7953
+ cpufreq_cpu_put(policy);
7954
+ }
67607955 put_cpu();
67617956 #endif
67627957 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
67637958 CPUFREQ_TRANSITION_NOTIFIER);
67647959 }
6765
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
67667960
67677961 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
67687962 kvmclock_cpu_online, kvmclock_cpu_down_prep);
....@@ -6781,7 +7975,7 @@
67817975 int user_mode = 3;
67827976
67837977 if (__this_cpu_read(current_vcpu))
6784
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
7978
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
67857979
67867980 return user_mode != 0;
67877981 }
....@@ -6796,10 +7990,20 @@
67967990 return ip;
67977991 }
67987992
7993
+static void kvm_handle_intel_pt_intr(void)
7994
+{
7995
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
7996
+
7997
+ kvm_make_request(KVM_REQ_PMI, vcpu);
7998
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
7999
+ (unsigned long *)&vcpu->arch.pmu.global_status);
8000
+}
8001
+
67998002 static struct perf_guest_info_callbacks kvm_guest_cbs = {
68008003 .is_in_guest = kvm_is_in_guest,
68018004 .is_user_mode = kvm_is_user_mode,
68028005 .get_guest_ip = kvm_get_guest_ip,
8006
+ .handle_intel_pt_intr = NULL,
68038007 };
68048008
68058009 #ifdef CONFIG_X86_64
....@@ -6821,6 +8025,18 @@
68218025 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
68228026
68238027 /*
8028
+ * Indirection to move queue_work() out of the tk_core.seq write held
8029
+ * region to prevent possible deadlocks against time accessors which
8030
+ * are invoked with work related locks held.
8031
+ */
8032
+static void pvclock_irq_work_fn(struct irq_work *w)
8033
+{
8034
+ queue_work(system_long_wq, &pvclock_gtod_work);
8035
+}
8036
+
8037
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8038
+
8039
+/*
68248040 * Notification about pvclock gtod data update.
68258041 */
68268042 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
....@@ -6831,13 +8047,14 @@
68318047
68328048 update_pvclock_gtod(tk);
68338049
6834
- /* disable master clock if host does not trust, or does not
6835
- * use, TSC based clocksource.
8050
+ /*
8051
+ * Disable master clock if host does not trust, or does not use,
8052
+ * TSC based clocksource. Delegate queue_work() to irq_work as
8053
+ * this is invoked with tk_core.seq write held.
68368054 */
68378055 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
68388056 atomic_read(&kvm_guest_has_master_clock) != 0)
6839
- queue_work(system_long_wq, &pvclock_gtod_work);
6840
-
8057
+ irq_work_queue(&pvclock_irq_work);
68418058 return 0;
68428059 }
68438060
....@@ -6848,50 +8065,87 @@
68488065
68498066 int kvm_arch_init(void *opaque)
68508067 {
8068
+ struct kvm_x86_init_ops *ops = opaque;
68518069 int r;
6852
- struct kvm_x86_ops *ops = opaque;
68538070
6854
- if (kvm_x86_ops) {
8071
+ if (kvm_x86_ops.hardware_enable) {
68558072 printk(KERN_ERR "kvm: already loaded the other module\n");
68568073 r = -EEXIST;
68578074 goto out;
68588075 }
68598076
68608077 if (!ops->cpu_has_kvm_support()) {
6861
- printk(KERN_ERR "kvm: no hardware support\n");
8078
+ pr_err_ratelimited("kvm: no hardware support\n");
68628079 r = -EOPNOTSUPP;
68638080 goto out;
68648081 }
68658082 if (ops->disabled_by_bios()) {
6866
- printk(KERN_ERR "kvm: disabled by bios\n");
8083
+ pr_err_ratelimited("kvm: disabled by bios\n");
68678084 r = -EOPNOTSUPP;
68688085 goto out;
68698086 }
68708087
6871
- r = -ENOMEM;
6872
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
6873
- if (!shared_msrs) {
6874
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
8088
+ /*
8089
+ * KVM explicitly assumes that the guest has an FPU and
8090
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
8091
+ * vCPU's FPU state as a fxregs_state struct.
8092
+ */
8093
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8094
+ printk(KERN_ERR "kvm: inadequate fpu\n");
8095
+ r = -EOPNOTSUPP;
68758096 goto out;
68768097 }
68778098
6878
- r = kvm_mmu_module_init();
8099
+#ifdef CONFIG_PREEMPT_RT
8100
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
8101
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
8102
+ r = -EOPNOTSUPP;
8103
+ goto out;
8104
+ }
8105
+#endif
8106
+
8107
+ r = -ENOMEM;
8108
+ x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
8109
+ __alignof__(struct fpu), SLAB_ACCOUNT,
8110
+ NULL);
8111
+ if (!x86_fpu_cache) {
8112
+ printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
8113
+ goto out;
8114
+ }
8115
+
8116
+ x86_emulator_cache = kvm_alloc_emulator_cache();
8117
+ if (!x86_emulator_cache) {
8118
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
8119
+ goto out_free_x86_fpu_cache;
8120
+ }
8121
+
8122
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
8123
+ if (!user_return_msrs) {
8124
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
8125
+ goto out_free_x86_emulator_cache;
8126
+ }
8127
+
8128
+ r = kvm_mmu_vendor_module_init();
68798129 if (r)
68808130 goto out_free_percpu;
6881
-
6882
- kvm_x86_ops = ops;
68838131
68848132 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
68858133 PT_DIRTY_MASK, PT64_NX_MASK, 0,
68868134 PT_PRESENT_MASK, 0, sme_me_mask);
68878135 kvm_timer_init();
68888136
8137
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
8138
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
68898139 perf_register_guest_info_callbacks(&kvm_guest_cbs);
68908140
6891
- if (boot_cpu_has(X86_FEATURE_XSAVE))
8141
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
68928142 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
8143
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
8144
+ }
68938145
68948146 kvm_lapic_init();
8147
+ if (pi_inject_timer == -1)
8148
+ pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
68958149 #ifdef CONFIG_X86_64
68968150 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
68978151
....@@ -6902,7 +8156,11 @@
69028156 return 0;
69038157
69048158 out_free_percpu:
6905
- free_percpu(shared_msrs);
8159
+ free_percpu(user_return_msrs);
8160
+out_free_x86_emulator_cache:
8161
+ kmem_cache_destroy(x86_emulator_cache);
8162
+out_free_x86_fpu_cache:
8163
+ kmem_cache_destroy(x86_fpu_cache);
69068164 out:
69078165 return r;
69088166 }
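The allocation sequence in kvm_arch_init() above follows the usual unwind-on-error shape: each resource gets an out_free_* label that releases everything acquired before it, in reverse order. The same shape reduced to two resources (made-up names):

#include <linux/percpu.h>
#include <linux/slab.h>

static struct kmem_cache *example_cache;
static unsigned long __percpu *example_pcpu;

static int example_init(void)
{
	int r = -ENOMEM;

	example_cache = kmem_cache_create("example", 64, 0, SLAB_ACCOUNT, NULL);
	if (!example_cache)
		goto out;

	example_pcpu = alloc_percpu(unsigned long);
	if (!example_pcpu)
		goto out_free_cache;

	return 0;

out_free_cache:
	kmem_cache_destroy(example_cache);
out:
	return r;
}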
....@@ -6915,6 +8173,7 @@
69158173 #endif
69168174 kvm_lapic_exit();
69178175 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
8176
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
69188177
69198178 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
69208179 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
....@@ -6922,11 +8181,14 @@
69228181 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
69238182 #ifdef CONFIG_X86_64
69248183 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8184
+ irq_work_sync(&pvclock_irq_work);
69258185 cancel_work_sync(&pvclock_gtod_work);
69268186 #endif
6927
- kvm_x86_ops = NULL;
6928
- kvm_mmu_module_exit();
6929
- free_percpu(shared_msrs);
8187
+ kvm_x86_ops.hardware_enable = NULL;
8188
+ kvm_mmu_vendor_module_exit();
8189
+ free_percpu(user_return_msrs);
8190
+ kmem_cache_destroy(x86_emulator_cache);
8191
+ kmem_cache_destroy(x86_fpu_cache);
69308192 }
69318193
69328194 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
....@@ -6990,22 +8252,52 @@
69908252 */
69918253 static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
69928254 {
6993
- struct kvm_lapic_irq lapic_irq;
8255
+ /*
8256
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
8257
+ * common code, e.g. for tracing. Defer initialization to the compiler.
8258
+ */
8259
+ struct kvm_lapic_irq lapic_irq = {
8260
+ .delivery_mode = APIC_DM_REMRD,
8261
+ .dest_mode = APIC_DEST_PHYSICAL,
8262
+ .shorthand = APIC_DEST_NOSHORT,
8263
+ .dest_id = apicid,
8264
+ };
69948265
6995
- lapic_irq.shorthand = 0;
6996
- lapic_irq.dest_mode = 0;
6997
- lapic_irq.level = 0;
6998
- lapic_irq.dest_id = apicid;
6999
- lapic_irq.msi_redir_hint = false;
7000
-
7001
- lapic_irq.delivery_mode = APIC_DM_REMRD;
70028266 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
70038267 }
70048268
7005
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
8269
+bool kvm_apicv_activated(struct kvm *kvm)
70068270 {
7007
- vcpu->arch.apicv_active = false;
7008
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
8271
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8272
+}
8273
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8274
+
8275
+void kvm_apicv_init(struct kvm *kvm, bool enable)
8276
+{
8277
+ if (enable)
8278
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
8279
+ &kvm->arch.apicv_inhibit_reasons);
8280
+ else
8281
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
8282
+ &kvm->arch.apicv_inhibit_reasons);
8283
+}
8284
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
8285
+
8286
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8287
+{
8288
+ struct kvm_vcpu *target = NULL;
8289
+ struct kvm_apic_map *map;
8290
+
8291
+ rcu_read_lock();
8292
+ map = rcu_dereference(kvm->arch.apic_map);
8293
+
8294
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8295
+ target = map->phys_map[dest_id]->vcpu;
8296
+
8297
+ rcu_read_unlock();
8298
+
8299
+ if (target && READ_ONCE(target->ready))
8300
+ kvm_vcpu_yield_to(target);
70098301 }
70108302
70118303 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
....@@ -7016,11 +8308,11 @@
70168308 if (kvm_hv_hypercall_enabled(vcpu->kvm))
70178309 return kvm_hv_hypercall(vcpu);
70188310
7019
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
7020
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
7021
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
7022
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
7023
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
8311
+ nr = kvm_rax_read(vcpu);
8312
+ a0 = kvm_rbx_read(vcpu);
8313
+ a1 = kvm_rcx_read(vcpu);
8314
+ a2 = kvm_rdx_read(vcpu);
8315
+ a3 = kvm_rsi_read(vcpu);
70248316
70258317 trace_kvm_hypercall(nr, a0, a1, a2, a3);
70268318
....@@ -7033,17 +8325,23 @@
70338325 a3 &= 0xFFFFFFFF;
70348326 }
70358327
7036
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
8328
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
70378329 ret = -KVM_EPERM;
70388330 goto out;
70398331 }
8332
+
8333
+ ret = -KVM_ENOSYS;
70408334
70418335 switch (nr) {
70428336 case KVM_HC_VAPIC_POLL_IRQ:
70438337 ret = 0;
70448338 break;
70458339 case KVM_HC_KICK_CPU:
8340
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8341
+ break;
8342
+
70468343 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8344
+ kvm_sched_yield(vcpu->kvm, a1);
70478345 ret = 0;
70488346 break;
70498347 #ifdef CONFIG_X86_64
....@@ -7052,7 +8350,17 @@
70528350 break;
70538351 #endif
70548352 case KVM_HC_SEND_IPI:
8353
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8354
+ break;
8355
+
70558356 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8357
+ break;
8358
+ case KVM_HC_SCHED_YIELD:
8359
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8360
+ break;
8361
+
8362
+ kvm_sched_yield(vcpu->kvm, a0);
8363
+ ret = 0;
70568364 break;
70578365 default:
70588366 ret = -KVM_ENOSYS;
....@@ -7061,7 +8369,7 @@
70618369 out:
70628370 if (!op_64_bit)
70638371 ret = (u32)ret;
7064
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
8372
+ kvm_rax_write(vcpu, ret);
70658373
70668374 ++vcpu->stat.hypercalls;
70678375 return kvm_skip_emulated_instruction(vcpu);
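kvm_emulate_hypercall() above reads the call number from RAX and its arguments from RBX/RCX/RDX/RSI, and hands the result back in RAX. A hedged guest-side sketch of a one-argument hypercall as issued on an Intel CPU (AMD uses vmmcall; real guests go through the kvm_hypercall*() helpers, which pick the right instruction):

static inline long example_kvm_hypercall1(unsigned int nr, unsigned long p1)
{
	long ret;

	/* nr in RAX, first argument in RBX; the host writes the result to RAX. */
	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1)
		     : "memory");
	return ret;
}

/* e.g. example_kvm_hypercall1(KVM_HC_SCHED_YIELD, dest_apicid); */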
....@@ -7074,7 +8382,7 @@
70748382 char instruction[3];
70758383 unsigned long rip = kvm_rip_read(vcpu);
70768384
7077
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
8385
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
70788386
70798387 return emulator_write_emulated(ctxt, rip, instruction, 3,
70808388 &ctxt->exception);
....@@ -7103,7 +8411,7 @@
71038411 {
71048412 int max_irr, tpr;
71058413
7106
- if (!kvm_x86_ops->update_cr8_intercept)
8414
+ if (!kvm_x86_ops.update_cr8_intercept)
71078415 return;
71088416
71098417 if (!lapic_in_kernel(vcpu))
....@@ -7122,24 +8430,32 @@
71228430
71238431 tpr = kvm_lapic_get_cr8(vcpu);
71248432
7125
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
8433
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
71268434 }
71278435
71288436 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
71298437 {
7130
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
7131
- vcpu->arch.exception.error_code = false;
7132
- kvm_x86_ops->queue_exception(vcpu);
8438
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
8439
+ vcpu->arch.exception.has_error_code,
8440
+ vcpu->arch.exception.error_code,
8441
+ vcpu->arch.exception.injected);
8442
+
8443
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
8444
+ vcpu->arch.exception.error_code = false;
8445
+ kvm_x86_ops.queue_exception(vcpu);
71338446 }
71348447
7135
-static int inject_pending_event(struct kvm_vcpu *vcpu)
8448
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
71368449 {
71378450 int r;
8451
+ bool can_inject = true;
71388452
71398453 /* try to reinject previous events if any */
71408454
7141
- if (vcpu->arch.exception.injected)
8455
+ if (vcpu->arch.exception.injected) {
71428456 kvm_inject_exception(vcpu);
8457
+ can_inject = false;
8458
+ }
71438459 /*
71448460 * Do not inject an NMI or interrupt if there is a pending
71458461 * exception. Exceptions and interrupts are recognized at
....@@ -7155,11 +8471,17 @@
71558471 * fully complete the previous instruction.
71568472 */
71578473 else if (!vcpu->arch.exception.pending) {
7158
- if (vcpu->arch.nmi_injected)
7159
- kvm_x86_ops->set_nmi(vcpu);
7160
- else if (vcpu->arch.interrupt.injected)
7161
- kvm_x86_ops->set_irq(vcpu);
8474
+ if (vcpu->arch.nmi_injected) {
8475
+ kvm_x86_ops.set_nmi(vcpu);
8476
+ can_inject = false;
8477
+ } else if (vcpu->arch.interrupt.injected) {
8478
+ kvm_x86_ops.set_irq(vcpu);
8479
+ can_inject = false;
8480
+ }
71628481 }
8482
+
8483
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
8484
+ vcpu->arch.exception.pending);
71638485
71648486 /*
71658487 * Call check_nested_events() even if we reinjected a previous event
....@@ -7167,69 +8489,107 @@
71678489 * from L2 to L1 due to pending L1 events which require exit
71688490 * from L2 to L1.
71698491 */
7170
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7171
- r = kvm_x86_ops->check_nested_events(vcpu);
7172
- if (r != 0)
7173
- return r;
8492
+ if (is_guest_mode(vcpu)) {
8493
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
8494
+ if (r < 0)
8495
+ goto busy;
71748496 }
71758497
71768498 /* try to inject new event if pending */
71778499 if (vcpu->arch.exception.pending) {
7178
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
7179
- vcpu->arch.exception.has_error_code,
7180
- vcpu->arch.exception.error_code);
7181
-
7182
- WARN_ON_ONCE(vcpu->arch.exception.injected);
7183
- vcpu->arch.exception.pending = false;
7184
- vcpu->arch.exception.injected = true;
7185
-
8500
+ /*
8501
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
8502
+ * value pushed on the stack. Trap-like exceptions and all #DBs
8503
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
8504
+ * AMD states that code breakpoint #DBs explicitly clear RF=0).
8505
+ *
8506
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
8507
+ * describe the behavior of General Detect #DBs, which are
8508
+ * fault-like. They do _not_ set RF, a la code breakpoints.
8509
+ */
71868510 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
71878511 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
71888512 X86_EFLAGS_RF);
71898513
7190
- if (vcpu->arch.exception.nr == DB_VECTOR &&
7191
- (vcpu->arch.dr7 & DR7_GD)) {
7192
- vcpu->arch.dr7 &= ~DR7_GD;
7193
- kvm_update_dr7(vcpu);
8514
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
8515
+ kvm_deliver_exception_payload(vcpu);
8516
+ if (vcpu->arch.dr7 & DR7_GD) {
8517
+ vcpu->arch.dr7 &= ~DR7_GD;
8518
+ kvm_update_dr7(vcpu);
8519
+ }
71948520 }
71958521
71968522 kvm_inject_exception(vcpu);
8523
+
8524
+ vcpu->arch.exception.pending = false;
8525
+ vcpu->arch.exception.injected = true;
8526
+
8527
+ can_inject = false;
71978528 }
71988529
7199
- /* Don't consider new event if we re-injected an event */
7200
- if (kvm_event_needs_reinjection(vcpu))
7201
- return 0;
7202
-
7203
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
7204
- kvm_x86_ops->smi_allowed(vcpu)) {
7205
- vcpu->arch.smi_pending = false;
7206
- ++vcpu->arch.smi_count;
7207
- enter_smm(vcpu);
7208
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
7209
- --vcpu->arch.nmi_pending;
7210
- vcpu->arch.nmi_injected = true;
7211
- kvm_x86_ops->set_nmi(vcpu);
7212
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
7213
- /*
7214
- * Because interrupts can be injected asynchronously, we are
7215
- * calling check_nested_events again here to avoid a race condition.
7216
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
7217
- * proposal and current concerns. Perhaps we should be setting
7218
- * KVM_REQ_EVENT only on certain events and not unconditionally?
7219
- */
7220
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7221
- r = kvm_x86_ops->check_nested_events(vcpu);
7222
- if (r != 0)
7223
- return r;
7224
- }
7225
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
7226
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
7227
- false);
7228
- kvm_x86_ops->set_irq(vcpu);
7229
- }
8530
+ /*
8531
+ * Finally, inject interrupt events. If an event cannot be injected
8532
+ * due to architectural conditions (e.g. IF=0) a window-open exit
8533
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
8534
+ * and can architecturally be injected, but we cannot do it right now:
8535
+ * an interrupt could have arrived just now and we have to inject it
8536
+ * as a vmexit, or there could already an event in the queue, which is
8537
+ * indicated by can_inject. In that case we request an immediate exit
8538
+ * in order to make progress and get back here for another iteration.
8539
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8540
+ */
8541
+ if (vcpu->arch.smi_pending) {
8542
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8543
+ if (r < 0)
8544
+ goto busy;
8545
+ if (r) {
8546
+ vcpu->arch.smi_pending = false;
8547
+ ++vcpu->arch.smi_count;
8548
+ enter_smm(vcpu);
8549
+ can_inject = false;
8550
+ } else
8551
+ kvm_x86_ops.enable_smi_window(vcpu);
72308552 }
72318553
7232
- return 0;
8554
+ if (vcpu->arch.nmi_pending) {
8555
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8556
+ if (r < 0)
8557
+ goto busy;
8558
+ if (r) {
8559
+ --vcpu->arch.nmi_pending;
8560
+ vcpu->arch.nmi_injected = true;
8561
+ kvm_x86_ops.set_nmi(vcpu);
8562
+ can_inject = false;
8563
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8564
+ }
8565
+ if (vcpu->arch.nmi_pending)
8566
+ kvm_x86_ops.enable_nmi_window(vcpu);
8567
+ }
8568
+
8569
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
8570
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8571
+ if (r < 0)
8572
+ goto busy;
8573
+ if (r) {
8574
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8575
+ kvm_x86_ops.set_irq(vcpu);
8576
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8577
+ }
8578
+ if (kvm_cpu_has_injectable_intr(vcpu))
8579
+ kvm_x86_ops.enable_irq_window(vcpu);
8580
+ }
8581
+
8582
+ if (is_guest_mode(vcpu) &&
8583
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
8584
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8585
+ *req_immediate_exit = true;
8586
+
8587
+ WARN_ON(vcpu->arch.exception.pending);
8588
+ return;
8589
+
8590
+busy:
8591
+ *req_immediate_exit = true;
8592
+ return;
72338593 }
72348594
72358595 static void process_nmi(struct kvm_vcpu *vcpu)
....@@ -7241,7 +8601,7 @@
72418601 * If an NMI is already in progress, limit further NMIs to just one.
72428602 * Otherwise, allow two (and we'll inject the first one immediately).
72438603 */
7244
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8604
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
72458605 limit = 1;
72468606
72478607 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
....@@ -7331,11 +8691,11 @@
73318691 put_smstate(u32, buf, 0x7f7c, seg.limit);
73328692 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
73338693
7334
- kvm_x86_ops->get_gdt(vcpu, &dt);
8694
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73358695 put_smstate(u32, buf, 0x7f74, dt.address);
73368696 put_smstate(u32, buf, 0x7f70, dt.size);
73378697
7338
- kvm_x86_ops->get_idt(vcpu, &dt);
8698
+ kvm_x86_ops.get_idt(vcpu, &dt);
73398699 put_smstate(u32, buf, 0x7f58, dt.address);
73408700 put_smstate(u32, buf, 0x7f54, dt.size);
73418701
....@@ -7385,7 +8745,7 @@
73858745 put_smstate(u32, buf, 0x7e94, seg.limit);
73868746 put_smstate(u64, buf, 0x7e98, seg.base);
73878747
7388
- kvm_x86_ops->get_idt(vcpu, &dt);
8748
+ kvm_x86_ops.get_idt(vcpu, &dt);
73898749 put_smstate(u32, buf, 0x7e84, dt.size);
73908750 put_smstate(u64, buf, 0x7e88, dt.address);
73918751
....@@ -7395,7 +8755,7 @@
73958755 put_smstate(u32, buf, 0x7e74, seg.limit);
73968756 put_smstate(u64, buf, 0x7e78, seg.base);
73978757
7398
- kvm_x86_ops->get_gdt(vcpu, &dt);
8758
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73998759 put_smstate(u32, buf, 0x7e64, dt.size);
74008760 put_smstate(u64, buf, 0x7e68, dt.address);
74018761
....@@ -7425,28 +8785,28 @@
74258785 * vCPU state (e.g. leave guest mode) after we've saved the state into
74268786 * the SMM state-save area.
74278787 */
7428
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
8788
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
74298789
74308790 vcpu->arch.hflags |= HF_SMM_MASK;
74318791 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
74328792
7433
- if (kvm_x86_ops->get_nmi_mask(vcpu))
8793
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
74348794 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
74358795 else
7436
- kvm_x86_ops->set_nmi_mask(vcpu, true);
8796
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
74378797
74388798 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
74398799 kvm_rip_write(vcpu, 0x8000);
74408800
74418801 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
7442
- kvm_x86_ops->set_cr0(vcpu, cr0);
8802
+ kvm_x86_ops.set_cr0(vcpu, cr0);
74438803 vcpu->arch.cr0 = cr0;
74448804
7445
- kvm_x86_ops->set_cr4(vcpu, 0);
8805
+ kvm_x86_ops.set_cr4(vcpu, 0);
74468806
74478807 /* Undocumented: IDT limit is set to zero on entry to SMM. */
74488808 dt.address = dt.size = 0;
7449
- kvm_x86_ops->set_idt(vcpu, &dt);
8809
+ kvm_x86_ops.set_idt(vcpu, &dt);
74508810
74518811 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
74528812
....@@ -7477,10 +8837,10 @@
74778837
74788838 #ifdef CONFIG_X86_64
74798839 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
7480
- kvm_x86_ops->set_efer(vcpu, 0);
8840
+ kvm_x86_ops.set_efer(vcpu, 0);
74818841 #endif
74828842
7483
- kvm_update_cpuid(vcpu);
8843
+ kvm_update_cpuid_runtime(vcpu);
74848844 kvm_mmu_reset_context(vcpu);
74858845 }
74868846
....@@ -7490,10 +8850,82 @@
74908850 kvm_make_request(KVM_REQ_EVENT, vcpu);
74918851 }
74928852
8853
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8854
+ unsigned long *vcpu_bitmap)
8855
+{
8856
+ cpumask_var_t cpus;
8857
+
8858
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8859
+
8860
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8861
+ NULL, vcpu_bitmap, cpus);
8862
+
8863
+ free_cpumask_var(cpus);
8864
+}
8865
+
74938866 void kvm_make_scan_ioapic_request(struct kvm *kvm)
74948867 {
74958868 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
74968869 }
8870
+
8871
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8872
+{
8873
+ if (!lapic_in_kernel(vcpu))
8874
+ return;
8875
+
8876
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8877
+ kvm_apic_update_apicv(vcpu);
8878
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8879
+}
8880
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8881
+
8882
+/*
8883
+ * NOTE: Do not hold any lock prior to calling this.
8884
+ *
8885
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8886
+ * locked, because it calls __x86_set_memory_region() which does
8887
+ * synchronize_srcu(&kvm->srcu).
8888
+ */
8889
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8890
+{
8891
+ struct kvm_vcpu *except;
8892
+ unsigned long old, new, expected;
8893
+
8894
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8895
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8896
+ return;
8897
+
8898
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8899
+ do {
8900
+ expected = new = old;
8901
+ if (activate)
8902
+ __clear_bit(bit, &new);
8903
+ else
8904
+ __set_bit(bit, &new);
8905
+ if (new == old)
8906
+ break;
8907
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8908
+ } while (old != expected);
8909
+
8910
+ if (!!old == !!new)
8911
+ return;
8912
+
8913
+ trace_kvm_apicv_update_request(activate, bit);
8914
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8915
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8916
+
8917
+ /*
8918
+ * Sending request to update APICV for all other vcpus,
8919
+ * while update the calling vcpu immediately instead of
8920
+ * waiting for another #VMEXIT to handle the request.
8921
+ */
8922
+ except = kvm_get_running_vcpu();
8923
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8924
+ except);
8925
+ if (except)
8926
+ kvm_vcpu_update_apicv(except);
8927
+}
8928
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
74978929
74988930 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
74998931 {
....@@ -7506,7 +8938,7 @@
75068938 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
75078939 else {
75088940 if (vcpu->arch.apicv_active)
7509
- kvm_x86_ops->sync_pir_to_irr(vcpu);
8941
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
75108942 if (ioapic_in_kernel(vcpu->kvm))
75118943 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
75128944 }
....@@ -7526,7 +8958,7 @@
75268958
75278959 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
75288960 vcpu_to_synic(vcpu)->vec_bitmap, 256);
7529
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
8961
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
75308962 }
75318963
75328964 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
....@@ -7543,28 +8975,22 @@
75438975 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
75448976 }
75458977
8978
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
8979
+{
8980
+ if (kvm_x86_ops.guest_memory_reclaimed)
8981
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
8982
+}
8983
+
75468984 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
75478985 {
7548
- struct page *page = NULL;
7549
-
75508986 if (!lapic_in_kernel(vcpu))
75518987 return;
75528988
7553
- if (!kvm_x86_ops->set_apic_access_page_addr)
8989
+ if (!kvm_x86_ops.set_apic_access_page_addr)
75548990 return;
75558991
7556
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
7557
- if (is_error_page(page))
7558
- return;
7559
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
7560
-
7561
- /*
7562
- * Do not pin apic access page in memory, the MMU notifier
7563
- * will call us again if it is migrated or swapped out.
7564
- */
7565
- put_page(page);
8992
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
75668993 }
7567
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
75688994
75698995 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
75708996 {
....@@ -7583,12 +9009,17 @@
75839009 bool req_int_win =
75849010 dm_request_for_irq_injection(vcpu) &&
75859011 kvm_cpu_accept_dm_intr(vcpu);
9012
+ fastpath_t exit_fastpath;
75869013
75879014 bool req_immediate_exit = false;
75889015
75899016 if (kvm_request_pending(vcpu)) {
7590
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
7591
- kvm_x86_ops->get_vmcs12_pages(vcpu);
9017
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9018
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
9019
+ r = 0;
9020
+ goto out;
9021
+ }
9022
+ }
75929023 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
75939024 kvm_mmu_unload(vcpu);
75949025 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
....@@ -7604,10 +9035,19 @@
76049035 }
76059036 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
76069037 kvm_mmu_sync_roots(vcpu);
7607
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
7608
- kvm_mmu_load_cr3(vcpu);
7609
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
7610
- kvm_vcpu_flush_tlb(vcpu, true);
9038
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
9039
+ kvm_mmu_load_pgd(vcpu);
9040
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
9041
+ kvm_vcpu_flush_tlb_all(vcpu);
9042
+
9043
+ /* Flushing all ASIDs flushes the current ASID... */
9044
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9045
+ }
9046
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
9047
+ kvm_vcpu_flush_tlb_current(vcpu);
9048
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
9049
+ kvm_vcpu_flush_tlb_guest(vcpu);
9050
+
76119051 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
76129052 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
76139053 r = 0;
....@@ -7678,6 +9118,12 @@
76789118 */
76799119 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
76809120 kvm_hv_process_stimers(vcpu);
9121
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
9122
+ kvm_vcpu_update_apicv(vcpu);
9123
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
9124
+ kvm_check_async_pf_completion(vcpu);
9125
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
9126
+ kvm_x86_ops.msr_filter_changed(vcpu);
76819127 }
76829128
76839129 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
....@@ -7688,32 +9134,9 @@
76889134 goto out;
76899135 }
76909136
7691
- if (inject_pending_event(vcpu) != 0)
7692
- req_immediate_exit = true;
7693
- else {
7694
- /* Enable SMI/NMI/IRQ window open exits if needed.
7695
- *
7696
- * SMIs have three cases:
7697
- * 1) They can be nested, and then there is nothing to
7698
- * do here because RSM will cause a vmexit anyway.
7699
- * 2) There is an ISA-specific reason why SMI cannot be
7700
- * injected, and the moment when this changes can be
7701
- * intercepted.
7702
- * 3) Or the SMI can be pending because
7703
- * inject_pending_event has completed the injection
7704
- * of an IRQ or NMI from the previous vmexit, and
7705
- * then we request an immediate exit to inject the
7706
- * SMI.
7707
- */
7708
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
7709
- if (!kvm_x86_ops->enable_smi_window(vcpu))
7710
- req_immediate_exit = true;
7711
- if (vcpu->arch.nmi_pending)
7712
- kvm_x86_ops->enable_nmi_window(vcpu);
7713
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
7714
- kvm_x86_ops->enable_irq_window(vcpu);
7715
- WARN_ON(vcpu->arch.exception.pending);
7716
- }
9137
+ inject_pending_event(vcpu, &req_immediate_exit);
9138
+ if (req_int_win)
9139
+ kvm_x86_ops.enable_irq_window(vcpu);
77179140
77189141 if (kvm_lapic_enabled(vcpu)) {
77199142 update_cr8_intercept(vcpu);
....@@ -7728,7 +9151,7 @@
77289151
77299152 preempt_disable();
77309153
7731
- kvm_x86_ops->prepare_guest_switch(vcpu);
9154
+ kvm_x86_ops.prepare_guest_switch(vcpu);
77329155
77339156 /*
77349157 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
....@@ -7744,7 +9167,7 @@
77449167 * 1) We should set ->mode before checking ->requests. Please see
77459168 * the comment in kvm_vcpu_exiting_guest_mode().
77469169 *
7747
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
9170
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
77489171 * pairs with the memory barrier implicit in pi_test_and_set_on
77499172 * (see vmx_deliver_posted_interrupt).
77509173 *
....@@ -7759,10 +9182,9 @@
77599182 * notified with kvm_vcpu_kick.
77609183 */
77619184 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
7762
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9185
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
77639186
7764
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
7765
- || need_resched() || signal_pending(current)) {
9187
+ if (kvm_vcpu_exit_request(vcpu)) {
77669188 vcpu->mode = OUTSIDE_GUEST_MODE;
77679189 smp_wmb();
77689190 local_irq_enable();
....@@ -7774,13 +9196,14 @@
77749196
77759197 if (req_immediate_exit) {
77769198 kvm_make_request(KVM_REQ_EVENT, vcpu);
7777
- kvm_x86_ops->request_immediate_exit(vcpu);
9199
+ kvm_x86_ops.request_immediate_exit(vcpu);
77789200 }
77799201
7780
- trace_kvm_entry(vcpu->vcpu_id);
7781
- if (lapic_timer_advance_ns)
7782
- wait_lapic_expire(vcpu);
7783
- guest_enter_irqoff();
9202
+ trace_kvm_entry(vcpu);
9203
+
9204
+ fpregs_assert_state_consistent();
9205
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9206
+ switch_fpu_return();
77849207
77859208 if (unlikely(vcpu->arch.switch_db_regs)) {
77869209 set_debugreg(0, 7);
....@@ -7794,7 +9217,7 @@
77949217 set_debugreg(0, 7);
77959218 }
77969219
7797
- kvm_x86_ops->run(vcpu);
9220
+ exit_fastpath = kvm_x86_ops.run(vcpu);
77989221
77999222 /*
78009223 * Do this here before restoring debug registers on the host. And
....@@ -7804,9 +9227,8 @@
78049227 */
78059228 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
78069229 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
7807
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
9230
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
78089231 kvm_update_dr0123(vcpu);
7809
- kvm_update_dr6(vcpu);
78109232 kvm_update_dr7(vcpu);
78119233 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
78129234 }
....@@ -7821,18 +9243,43 @@
78219243 if (hw_breakpoint_active())
78229244 hw_breakpoint_restore();
78239245
9246
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
78249247 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
78259248
78269249 vcpu->mode = OUTSIDE_GUEST_MODE;
78279250 smp_wmb();
78289251
9252
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
9253
+
9254
+ /*
9255
+ * Consume any pending interrupts, including the possible source of
9256
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
9257
+ * An instruction is required after local_irq_enable() to fully unblock
9258
+ * interrupts on processors that implement an interrupt shadow; the
9259
+ * stat.exits increment will do nicely.
9260
+ */
78299261 kvm_before_interrupt(vcpu);
7830
- kvm_x86_ops->handle_external_intr(vcpu);
9262
+ local_irq_enable();
9263
+ ++vcpu->stat.exits;
9264
+ local_irq_disable();
78319265 kvm_after_interrupt(vcpu);
78329266
7833
- ++vcpu->stat.exits;
9267
+ /*
9268
+ * Wait until after servicing IRQs to account guest time so that any
9269
+ * ticks that occurred while running the guest are properly accounted
9270
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
9271
+ * of accounting via context tracking, but the loss of accuracy is
9272
+ * acceptable for all known use cases.
9273
+ */
9274
+ vtime_account_guest_exit();
78349275
7835
- guest_exit_irqoff();
9276
+ if (lapic_in_kernel(vcpu)) {
9277
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9278
+ if (delta != S64_MIN) {
9279
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9280
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9281
+ }
9282
+ }
78369283
78379284 local_irq_enable();
78389285 preempt_enable();
....@@ -7853,12 +9300,13 @@
78539300 if (vcpu->arch.apic_attention)
78549301 kvm_lapic_sync_from_vapic(vcpu);
78559302
7856
- vcpu->arch.gpa_available = false;
7857
- r = kvm_x86_ops->handle_exit(vcpu);
9303
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
78589304 return r;
78599305
78609306 cancel_injection:
7861
- kvm_x86_ops->cancel_injection(vcpu);
9307
+ if (req_immediate_exit)
9308
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
9309
+ kvm_x86_ops.cancel_injection(vcpu);
78629310 if (unlikely(vcpu->arch.apic_attention))
78639311 kvm_lapic_sync_from_vapic(vcpu);
78649312 out:
....@@ -7868,13 +9316,13 @@
78689316 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
78699317 {
78709318 if (!kvm_arch_vcpu_runnable(vcpu) &&
7871
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
9319
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
78729320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
78739321 kvm_vcpu_block(vcpu);
78749322 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
78759323
7876
- if (kvm_x86_ops->post_block)
7877
- kvm_x86_ops->post_block(vcpu);
9324
+ if (kvm_x86_ops.post_block)
9325
+ kvm_x86_ops.post_block(vcpu);
78789326
78799327 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
78809328 return 1;
....@@ -7886,6 +9334,7 @@
78869334 vcpu->arch.pv.pv_unhalted = false;
78879335 vcpu->arch.mp_state =
78889336 KVM_MP_STATE_RUNNABLE;
9337
+ fallthrough;
78899338 case KVM_MP_STATE_RUNNABLE:
78909339 vcpu->arch.apf.halted = false;
78919340 break;
....@@ -7893,15 +9342,14 @@
78939342 break;
78949343 default:
78959344 return -EINTR;
7896
- break;
78979345 }
78989346 return 1;
78999347 }
79009348
79019349 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
79029350 {
7903
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7904
- kvm_x86_ops->check_nested_events(vcpu);
9351
+ if (is_guest_mode(vcpu))
9352
+ kvm_x86_ops.nested_ops->check_events(vcpu);
79059353
79069354 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
79079355 !vcpu->arch.apf.halted);
....@@ -7937,17 +9385,11 @@
79379385 break;
79389386 }
79399387
7940
- kvm_check_async_pf_completion(vcpu);
7941
-
7942
- if (signal_pending(current)) {
7943
- r = -EINTR;
7944
- vcpu->run->exit_reason = KVM_EXIT_INTR;
7945
- ++vcpu->stat.signal_exits;
7946
- break;
7947
- }
7948
- if (need_resched()) {
9388
+ if (__xfer_to_guest_mode_work_pending()) {
79499389 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
7950
- cond_resched();
9390
+ r = xfer_to_guest_mode_handle_work(vcpu);
9391
+ if (r)
9392
+ return r;
79519393 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
79529394 }
79539395 }
....@@ -7960,12 +9402,11 @@
79609402 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
79619403 {
79629404 int r;
9405
+
79639406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
79649407 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
79659408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
7966
- if (r != EMULATE_DONE)
7967
- return 0;
7968
- return 1;
9409
+ return r;
79699410 }
79709411
79719412 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
....@@ -8038,31 +9479,55 @@
80389479 return 0;
80399480 }
80409481
9482
+static void kvm_save_current_fpu(struct fpu *fpu)
9483
+{
9484
+ /*
9485
+ * If the target FPU state is not resident in the CPU registers, just
9486
+ * memcpy() from current, else save CPU state directly to the target.
9487
+ */
9488
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9489
+ memcpy(&fpu->state, &current->thread.fpu.state,
9490
+ fpu_kernel_xstate_size);
9491
+ else
9492
+ copy_fpregs_to_fpstate(fpu);
9493
+}
9494
+
80419495 /* Swap (qemu) user FPU context for the guest FPU context. */
80429496 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
80439497 {
8044
- preempt_disable();
8045
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
8046
- /* PKRU is separately restored in kvm_x86_ops->run. */
8047
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
9498
+ fpregs_lock();
9499
+
9500
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
9501
+
9502
+ /* PKRU is separately restored in kvm_x86_ops.run. */
9503
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
80489504 ~XFEATURE_MASK_PKRU);
8049
- preempt_enable();
9505
+
9506
+ fpregs_mark_activate();
9507
+ fpregs_unlock();
9508
+
80509509 trace_kvm_fpu(1);
80519510 }
80529511
80539512 /* When vcpu_run ends, restore user space FPU context. */
80549513 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
80559514 {
8056
- preempt_disable();
8057
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
8058
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
8059
- preempt_enable();
9515
+ fpregs_lock();
9516
+
9517
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
9518
+
9519
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9520
+
9521
+ fpregs_mark_activate();
9522
+ fpregs_unlock();
9523
+
80609524 ++vcpu->stat.fpu_reload;
80619525 trace_kvm_fpu(0);
80629526 }
80639527
8064
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
9528
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
80659529 {
9530
+ struct kvm_run *kvm_run = vcpu->run;
80669531 int r;
80679532
80689533 vcpu_load(vcpu);
....@@ -8080,18 +9545,18 @@
80809545 r = -EAGAIN;
80819546 if (signal_pending(current)) {
80829547 r = -EINTR;
8083
- vcpu->run->exit_reason = KVM_EXIT_INTR;
9548
+ kvm_run->exit_reason = KVM_EXIT_INTR;
80849549 ++vcpu->stat.signal_exits;
80859550 }
80869551 goto out;
80879552 }
80889553
8089
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9554
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
80909555 r = -EINVAL;
80919556 goto out;
80929557 }
80939558
8094
- if (vcpu->run->kvm_dirty_regs) {
9559
+ if (kvm_run->kvm_dirty_regs) {
80959560 r = sync_regs(vcpu);
80969561 if (r != 0)
80979562 goto out;
....@@ -8121,7 +9586,7 @@
81219586
81229587 out:
81239588 kvm_put_guest_fpu(vcpu);
8124
- if (vcpu->run->kvm_valid_regs)
9589
+ if (kvm_run->kvm_valid_regs)
81259590 store_regs(vcpu);
81269591 post_kvm_run_save(vcpu);
81279592 kvm_sigset_deactivate(vcpu);
....@@ -8140,26 +9605,26 @@
81409605 * that usually, but some bad designed PV devices (vmware
81419606 * backdoor interface) need this to work
81429607 */
8143
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
9608
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
81449609 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81459610 }
8146
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
8147
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
8148
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
8149
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
8150
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
8151
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
8152
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
8153
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
9611
+ regs->rax = kvm_rax_read(vcpu);
9612
+ regs->rbx = kvm_rbx_read(vcpu);
9613
+ regs->rcx = kvm_rcx_read(vcpu);
9614
+ regs->rdx = kvm_rdx_read(vcpu);
9615
+ regs->rsi = kvm_rsi_read(vcpu);
9616
+ regs->rdi = kvm_rdi_read(vcpu);
9617
+ regs->rsp = kvm_rsp_read(vcpu);
9618
+ regs->rbp = kvm_rbp_read(vcpu);
81549619 #ifdef CONFIG_X86_64
8155
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
8156
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
8157
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
8158
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
8159
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
8160
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
8161
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
8162
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
9620
+ regs->r8 = kvm_r8_read(vcpu);
9621
+ regs->r9 = kvm_r9_read(vcpu);
9622
+ regs->r10 = kvm_r10_read(vcpu);
9623
+ regs->r11 = kvm_r11_read(vcpu);
9624
+ regs->r12 = kvm_r12_read(vcpu);
9625
+ regs->r13 = kvm_r13_read(vcpu);
9626
+ regs->r14 = kvm_r14_read(vcpu);
9627
+ regs->r15 = kvm_r15_read(vcpu);
81639628 #endif
81649629
81659630 regs->rip = kvm_rip_read(vcpu);
....@@ -8179,23 +9644,23 @@
81799644 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
81809645 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81819646
8182
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
8183
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
8184
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
8185
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
8186
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
8187
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
8188
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
8189
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
9647
+ kvm_rax_write(vcpu, regs->rax);
9648
+ kvm_rbx_write(vcpu, regs->rbx);
9649
+ kvm_rcx_write(vcpu, regs->rcx);
9650
+ kvm_rdx_write(vcpu, regs->rdx);
9651
+ kvm_rsi_write(vcpu, regs->rsi);
9652
+ kvm_rdi_write(vcpu, regs->rdi);
9653
+ kvm_rsp_write(vcpu, regs->rsp);
9654
+ kvm_rbp_write(vcpu, regs->rbp);
81909655 #ifdef CONFIG_X86_64
8191
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
8192
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
8193
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
8194
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
8195
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
8196
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
8197
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
8198
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
9656
+ kvm_r8_write(vcpu, regs->r8);
9657
+ kvm_r9_write(vcpu, regs->r9);
9658
+ kvm_r10_write(vcpu, regs->r10);
9659
+ kvm_r11_write(vcpu, regs->r11);
9660
+ kvm_r12_write(vcpu, regs->r12);
9661
+ kvm_r13_write(vcpu, regs->r13);
9662
+ kvm_r14_write(vcpu, regs->r14);
9663
+ kvm_r15_write(vcpu, regs->r15);
81999664 #endif
82009665
82019666 kvm_rip_write(vcpu, regs->rip);
....@@ -8238,10 +9703,10 @@
82389703 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
82399704 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
82409705
8241
- kvm_x86_ops->get_idt(vcpu, &dt);
9706
+ kvm_x86_ops.get_idt(vcpu, &dt);
82429707 sregs->idt.limit = dt.size;
82439708 sregs->idt.base = dt.address;
8244
- kvm_x86_ops->get_gdt(vcpu, &dt);
9709
+ kvm_x86_ops.get_gdt(vcpu, &dt);
82459710 sregs->gdt.limit = dt.size;
82469711 sregs->gdt.base = dt.address;
82479712
....@@ -8253,7 +9718,7 @@
82539718 sregs->efer = vcpu->arch.efer;
82549719 sregs->apic_base = kvm_get_apic_base(vcpu);
82559720
8256
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
9721
+ memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
82579722
82589723 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
82599724 set_bit(vcpu->arch.interrupt.nr,
....@@ -8300,8 +9765,12 @@
83009765 mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
83019766 goto out;
83029767
8303
- /* INITs are latched while in SMM */
8304
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
9768
+ /*
9769
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9770
+ * INIT state; latched init should be reported using
9771
+ * KVM_SET_VCPU_EVENTS, so reject it here.
9772
+ */
9773
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
83059774 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
83069775 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
83079776 goto out;
....@@ -8322,21 +9791,23 @@
83229791 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
83239792 int reason, bool has_error_code, u32 error_code)
83249793 {
8325
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
9794
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
83269795 int ret;
83279796
83289797 init_emulate_ctxt(vcpu);
83299798
83309799 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
83319800 has_error_code, error_code);
8332
-
8333
- if (ret)
8334
- return EMULATE_FAIL;
9801
+ if (ret) {
9802
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9803
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9804
+ vcpu->run->internal.ndata = 0;
9805
+ return 0;
9806
+ }
83359807
83369808 kvm_rip_write(vcpu, ctxt->eip);
83379809 kvm_set_rflags(vcpu, ctxt->eflags);
8338
- kvm_make_request(KVM_REQ_EVENT, vcpu);
8339
- return EMULATE_DONE;
9810
+ return 1;
83409811 }
83419812 EXPORT_SYMBOL_GPL(kvm_task_switch);
83429813
....@@ -8350,6 +9821,8 @@
83509821 */
83519822 if (!(sregs->cr4 & X86_CR4_PAE)
83529823 || !(sregs->efer & EFER_LMA))
9824
+ return -EINVAL;
9825
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
83539826 return -EINVAL;
83549827 } else {
83559828 /*
....@@ -8382,31 +9855,31 @@
83829855
83839856 dt.size = sregs->idt.limit;
83849857 dt.address = sregs->idt.base;
8385
- kvm_x86_ops->set_idt(vcpu, &dt);
9858
+ kvm_x86_ops.set_idt(vcpu, &dt);
83869859 dt.size = sregs->gdt.limit;
83879860 dt.address = sregs->gdt.base;
8388
- kvm_x86_ops->set_gdt(vcpu, &dt);
9861
+ kvm_x86_ops.set_gdt(vcpu, &dt);
83899862
83909863 vcpu->arch.cr2 = sregs->cr2;
83919864 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
83929865 vcpu->arch.cr3 = sregs->cr3;
8393
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
9866
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
83949867
83959868 kvm_set_cr8(vcpu, sregs->cr8);
83969869
83979870 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
8398
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
9871
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
83999872
84009873 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
8401
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
9874
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
84029875 vcpu->arch.cr0 = sregs->cr0;
84039876
84049877 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
84059878 cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
84069879 (X86_CR4_OSXSAVE | X86_CR4_PKE));
8407
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
9880
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
84089881 if (cpuid_update_needed)
8409
- kvm_update_cpuid(vcpu);
9882
+ kvm_update_cpuid_runtime(vcpu);
84109883
84119884 idx = srcu_read_lock(&vcpu->kvm->srcu);
84129885 if (is_pae_paging(vcpu)) {
....@@ -8510,7 +9983,7 @@
85109983 */
85119984 kvm_set_rflags(vcpu, rflags);
85129985
8513
- kvm_x86_ops->update_bp_intercept(vcpu);
9986
+ kvm_x86_ops.update_exception_bitmap(vcpu);
85149987
85159988 r = 0;
85169989
....@@ -8549,7 +10022,7 @@
854910022
855010023 vcpu_load(vcpu);
855110024
8552
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10025
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
855310026 memcpy(fpu->fpr, fxsave->st_space, 128);
855410027 fpu->fcw = fxsave->cwd;
855510028 fpu->fsw = fxsave->swd;
....@@ -8557,7 +10030,7 @@
855710030 fpu->last_opcode = fxsave->fop;
855810031 fpu->last_ip = fxsave->rip;
855910032 fpu->last_dp = fxsave->rdp;
8560
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
10033
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
856110034
856210035 vcpu_put(vcpu);
856310036 return 0;
....@@ -8569,7 +10042,7 @@
856910042
857010043 vcpu_load(vcpu);
857110044
8572
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10045
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
857310046
857410047 memcpy(fxsave->st_space, fpu->fpr, 128);
857510048 fxsave->cwd = fpu->fcw;
....@@ -8578,7 +10051,7 @@
857810051 fxsave->fop = fpu->last_opcode;
857910052 fxsave->rip = fpu->last_ip;
858010053 fxsave->rdp = fpu->last_dp;
8581
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
10054
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
858210055
858310056 vcpu_put(vcpu);
858410057 return 0;
....@@ -8625,9 +10098,9 @@
862510098
862610099 static void fx_init(struct kvm_vcpu *vcpu)
862710100 {
8628
- fpstate_init(&vcpu->arch.guest_fpu.state);
10101
+ fpstate_init(&vcpu->arch.guest_fpu->state);
862910102 if (boot_cpu_has(X86_FEATURE_XSAVES))
8630
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
10103
+ vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
863110104 host_xcr0 | XSTATE_COMPACTION_ENABLED;
863210105
863310106 /*
....@@ -8638,48 +10111,122 @@
863810111 vcpu->arch.cr0 |= X86_CR0_ET;
863910112 }
864010113
8641
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
10114
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
864210115 {
8643
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
8644
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
8645
-
8646
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
8647
-
8648
- kvmclock_reset(vcpu);
8649
-
8650
- kvm_x86_ops->vcpu_free(vcpu);
8651
- free_cpumask_var(wbinvd_dirty_mask);
8652
-}
8653
-
8654
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
8655
- unsigned int id)
8656
-{
8657
- struct kvm_vcpu *vcpu;
8658
-
865910116 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
8660
- printk_once(KERN_WARNING
8661
- "kvm: SMP vm created on host with unstable TSC; "
8662
- "guest TSC will not be reliable\n");
10117
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
10118
+ "guest TSC will not be reliable\n");
866310119
8664
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
8665
-
8666
- return vcpu;
10120
+ return 0;
866710121 }
866810122
8669
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
10123
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
867010124 {
10125
+ struct page *page;
10126
+ int r;
10127
+
10128
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
10129
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10130
+ else
10131
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
10132
+
10133
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
10134
+
10135
+ r = kvm_mmu_create(vcpu);
10136
+ if (r < 0)
10137
+ return r;
10138
+
10139
+ if (irqchip_in_kernel(vcpu->kvm)) {
10140
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
10141
+ if (r < 0)
10142
+ goto fail_mmu_destroy;
10143
+ if (kvm_apicv_activated(vcpu->kvm))
10144
+ vcpu->arch.apicv_active = true;
10145
+ } else
10146
+ static_key_slow_inc(&kvm_no_apic_vcpu);
10147
+
10148
+ r = -ENOMEM;
10149
+
10150
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
10151
+ if (!page)
10152
+ goto fail_free_lapic;
10153
+ vcpu->arch.pio_data = page_address(page);
10154
+
10155
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
10156
+ GFP_KERNEL_ACCOUNT);
10157
+ if (!vcpu->arch.mce_banks)
10158
+ goto fail_free_pio_data;
10159
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
10160
+
10161
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
10162
+ GFP_KERNEL_ACCOUNT))
10163
+ goto fail_free_mce_banks;
10164
+
10165
+ if (!alloc_emulate_ctxt(vcpu))
10166
+ goto free_wbinvd_dirty_mask;
10167
+
10168
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
10169
+ GFP_KERNEL_ACCOUNT);
10170
+ if (!vcpu->arch.user_fpu) {
10171
+ pr_err("kvm: failed to allocate userspace's fpu\n");
10172
+ goto free_emulate_ctxt;
10173
+ }
10174
+
10175
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
10176
+ GFP_KERNEL_ACCOUNT);
10177
+ if (!vcpu->arch.guest_fpu) {
10178
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
10179
+ goto free_user_fpu;
10180
+ }
10181
+ fx_init(vcpu);
10182
+
10183
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
10184
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
10185
+
10186
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
10187
+
10188
+ kvm_async_pf_hash_reset(vcpu);
10189
+ kvm_pmu_init(vcpu);
10190
+
10191
+ vcpu->arch.pending_external_vector = -1;
10192
+ vcpu->arch.preempted_in_kernel = false;
10193
+
10194
+ kvm_hv_vcpu_init(vcpu);
10195
+
10196
+ r = kvm_x86_ops.vcpu_create(vcpu);
10197
+ if (r)
10198
+ goto free_guest_fpu;
10199
+
867110200 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
10201
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
867210202 kvm_vcpu_mtrr_init(vcpu);
867310203 vcpu_load(vcpu);
867410204 kvm_vcpu_reset(vcpu, false);
8675
- kvm_mmu_setup(vcpu);
10205
+ kvm_init_mmu(vcpu, false);
867610206 vcpu_put(vcpu);
867710207 return 0;
10208
+
10209
+free_guest_fpu:
10210
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10211
+free_user_fpu:
10212
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10213
+free_emulate_ctxt:
10214
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10215
+free_wbinvd_dirty_mask:
10216
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10217
+fail_free_mce_banks:
10218
+ kfree(vcpu->arch.mce_banks);
10219
+fail_free_pio_data:
10220
+ free_page((unsigned long)vcpu->arch.pio_data);
10221
+fail_free_lapic:
10222
+ kvm_free_lapic(vcpu);
10223
+fail_mmu_destroy:
10224
+ kvm_mmu_destroy(vcpu);
10225
+ return r;
867810226 }
867910227
868010228 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
868110229 {
8682
- struct msr_data msr;
868310230 struct kvm *kvm = vcpu->kvm;
868410231
868510232 kvm_hv_vcpu_postcreate(vcpu);
....@@ -8687,23 +10234,46 @@
868710234 if (mutex_lock_killable(&vcpu->mutex))
868810235 return;
868910236 vcpu_load(vcpu);
8690
- msr.data = 0x0;
8691
- msr.index = MSR_IA32_TSC;
8692
- msr.host_initiated = true;
8693
- kvm_write_tsc(vcpu, &msr);
10237
+ kvm_synchronize_tsc(vcpu, 0);
869410238 vcpu_put(vcpu);
10239
+
10240
+ /* poll control enabled by default */
10241
+ vcpu->arch.msr_kvm_poll_control = 1;
10242
+
869510243 mutex_unlock(&vcpu->mutex);
869610244
8697
- if (!kvmclock_periodic_sync)
8698
- return;
8699
-
8700
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
8701
- KVMCLOCK_SYNC_PERIOD);
10245
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
10246
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
10247
+ KVMCLOCK_SYNC_PERIOD);
870210248 }
870310249
870410250 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
870510251 {
8706
- kvm_arch_vcpu_free(vcpu);
10252
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
10253
+ int idx;
10254
+
10255
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
10256
+
10257
+ kvmclock_reset(vcpu);
10258
+
10259
+ kvm_x86_ops.vcpu_free(vcpu);
10260
+
10261
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10262
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10263
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10264
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10265
+
10266
+ kvm_hv_vcpu_uninit(vcpu);
10267
+ kvm_pmu_destroy(vcpu);
10268
+ kfree(vcpu->arch.mce_banks);
10269
+ kvm_free_lapic(vcpu);
10270
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
10271
+ kvm_mmu_destroy(vcpu);
10272
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
10273
+ free_page((unsigned long)vcpu->arch.pio_data);
10274
+ kvfree(vcpu->arch.cpuid_entries);
10275
+ if (!lapic_in_kernel(vcpu))
10276
+ static_key_slow_dec(&kvm_no_apic_vcpu);
870710277 }
870810278
870910279 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
....@@ -8719,19 +10289,18 @@
871910289 vcpu->arch.nmi_injected = false;
872010290 kvm_clear_interrupt_queue(vcpu);
872110291 kvm_clear_exception_queue(vcpu);
8722
- vcpu->arch.exception.pending = false;
872310292
872410293 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
872510294 kvm_update_dr0123(vcpu);
872610295 vcpu->arch.dr6 = DR6_INIT;
8727
- kvm_update_dr6(vcpu);
872810296 vcpu->arch.dr7 = DR7_FIXED_1;
872910297 kvm_update_dr7(vcpu);
873010298
873110299 vcpu->arch.cr2 = 0;
873210300
873310301 kvm_make_request(KVM_REQ_EVENT, vcpu);
8734
- vcpu->arch.apf.msr_val = 0;
10302
+ vcpu->arch.apf.msr_en_val = 0;
10303
+ vcpu->arch.apf.msr_int_val = 0;
873510304 vcpu->arch.st.msr_val = 0;
873610305
873710306 kvmclock_reset(vcpu);
....@@ -8749,12 +10318,12 @@
874910318 */
875010319 if (init_event)
875110320 kvm_put_guest_fpu(vcpu);
8752
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8753
- XFEATURE_MASK_BNDREGS);
10321
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10322
+ XFEATURE_BNDREGS);
875410323 if (mpx_state_buffer)
875510324 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
8756
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8757
- XFEATURE_MASK_BNDCSR);
10325
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10326
+ XFEATURE_BNDCSR);
875810327 if (mpx_state_buffer)
875910328 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
876010329 if (init_event)
....@@ -8765,7 +10334,6 @@
876510334 kvm_pmu_reset(vcpu);
876610335 vcpu->arch.smbase = 0x30000;
876710336
8768
- vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
876910337 vcpu->arch.msr_misc_features_enables = 0;
877010338
877110339 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
....@@ -8777,7 +10345,7 @@
877710345
877810346 vcpu->arch.ia32_xss = 0;
877910347
8780
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
10348
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
878110349 }
878210350
878310351 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
....@@ -8801,8 +10369,8 @@
880110369 u64 max_tsc = 0;
880210370 bool stable, backwards_tsc = false;
880310371
8804
- kvm_shared_msr_cpu_online();
8805
- ret = kvm_x86_ops->hardware_enable();
10372
+ kvm_user_return_msr_cpu_online();
10373
+ ret = kvm_x86_ops.hardware_enable();
880610374 if (ret != 0)
880710375 return ret;
880810376
....@@ -8828,7 +10396,7 @@
882810396 * before any KVM threads can be running. Unfortunately, we can't
882910397 * bring the TSCs fully up to date with real time, as we aren't yet far
883010398 * enough into CPU bringup that we know how much real time has actually
8831
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
10399
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
883210400 * variables that haven't been updated yet.
883310401 *
883410402 * So we simply find the maximum observed TSC above, then record the
....@@ -8884,19 +10452,32 @@
888410452
888510453 void kvm_arch_hardware_disable(void)
888610454 {
8887
- kvm_x86_ops->hardware_disable();
10455
+ kvm_x86_ops.hardware_disable();
888810456 drop_user_return_notifiers();
888910457 }
889010458
8891
-int kvm_arch_hardware_setup(void)
10459
+int kvm_arch_hardware_setup(void *opaque)
889210460 {
10461
+ struct kvm_x86_init_ops *ops = opaque;
889310462 int r;
889410463
8895
- r = kvm_x86_ops->hardware_setup();
10464
+ rdmsrl_safe(MSR_EFER, &host_efer);
10465
+
10466
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
10467
+ rdmsrl(MSR_IA32_XSS, host_xss);
10468
+
10469
+ r = ops->hardware_setup();
889610470 if (r != 0)
889710471 return r;
889810472
8899
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
10473
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10474
+
10475
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10476
+ supported_xss = 0;
10477
+
10478
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10479
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10480
+#undef __kvm_cpu_cap_has
890010481
890110482 if (kvm_has_tsc_control) {
890210483 /*
....@@ -8918,12 +10499,21 @@
891810499
891910500 void kvm_arch_hardware_unsetup(void)
892010501 {
8921
- kvm_x86_ops->hardware_unsetup();
10502
+ kvm_x86_ops.hardware_unsetup();
892210503 }
892310504
8924
-void kvm_arch_check_processor_compat(void *rtn)
10505
+int kvm_arch_check_processor_compat(void *opaque)
892510506 {
8926
- kvm_x86_ops->check_processor_compatibility(rtn);
10507
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10508
+ struct kvm_x86_init_ops *ops = opaque;
10509
+
10510
+ WARN_ON(!irqs_disabled());
10511
+
10512
+ if (__cr4_reserved_bits(cpu_has, c) !=
10513
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10514
+ return -EIO;
10515
+
10516
+ return ops->check_processor_compatibility();
892710517 }
892810518
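Both kvm_arch_check_processor_compat() above and kvm_arch_hardware_setup() earlier feed a feature predicate into __cr4_reserved_bits(): that helper expects a two-argument pred(container, feature) shape, so kvm_cpu_cap_has(), which takes only the feature bit, is wrapped in a throwaway macro that drops the first argument. The fragment below is a made-up, self-contained illustration of that adapter pattern (reserved_mask, cap_has and the FEAT_* names are invented for the example, not the kernel's definitions):

/* Illustration only: adapt a one-argument predicate to a mask builder
 * that expects pred(ctx, feature) -- the same trick as __kvm_cpu_cap_has. */
#define FEAT_PCID 0
#define FEAT_SMEP 1

#define reserved_mask(pred, ctx)			\
	((pred(ctx, FEAT_PCID) ? 0UL : (1UL << 17)) |	\
	 (pred(ctx, FEAT_SMEP) ? 0UL : (1UL << 20)))

static int cap_has(int feature)		/* one-argument predicate */
{
	return feature == FEAT_SMEP;
}

#define cap_has_adapter(unused, f) cap_has(f)

/* reserved_mask(cap_has_adapter, 0) == (1UL << 17): only the PCID bit
 * stays reserved, because the SMEP "capability" is reported present. */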
892910519 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
....@@ -8940,107 +10530,35 @@
894010530 struct static_key kvm_no_apic_vcpu __read_mostly;
894110531 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
894210532
8943
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8944
-{
8945
- struct page *page;
8946
- int r;
8947
-
8948
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
8949
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
8950
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
8951
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
8952
- else
8953
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
8954
-
8955
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
8956
- if (!page) {
8957
- r = -ENOMEM;
8958
- goto fail;
8959
- }
8960
- vcpu->arch.pio_data = page_address(page);
8961
-
8962
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
8963
-
8964
- r = kvm_mmu_create(vcpu);
8965
- if (r < 0)
8966
- goto fail_free_pio_data;
8967
-
8968
- if (irqchip_in_kernel(vcpu->kvm)) {
8969
- r = kvm_create_lapic(vcpu);
8970
- if (r < 0)
8971
- goto fail_mmu_destroy;
8972
- } else
8973
- static_key_slow_inc(&kvm_no_apic_vcpu);
8974
-
8975
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
8976
- GFP_KERNEL);
8977
- if (!vcpu->arch.mce_banks) {
8978
- r = -ENOMEM;
8979
- goto fail_free_lapic;
8980
- }
8981
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
8982
-
8983
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
8984
- r = -ENOMEM;
8985
- goto fail_free_mce_banks;
8986
- }
8987
-
8988
- fx_init(vcpu);
8989
-
8990
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
8991
-
8992
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
8993
-
8994
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
8995
-
8996
- kvm_async_pf_hash_reset(vcpu);
8997
- kvm_pmu_init(vcpu);
8998
-
8999
- vcpu->arch.pending_external_vector = -1;
9000
- vcpu->arch.preempted_in_kernel = false;
9001
-
9002
- kvm_hv_vcpu_init(vcpu);
9003
-
9004
- return 0;
9005
-
9006
-fail_free_mce_banks:
9007
- kfree(vcpu->arch.mce_banks);
9008
-fail_free_lapic:
9009
- kvm_free_lapic(vcpu);
9010
-fail_mmu_destroy:
9011
- kvm_mmu_destroy(vcpu);
9012
-fail_free_pio_data:
9013
- free_page((unsigned long)vcpu->arch.pio_data);
9014
-fail:
9015
- return r;
9016
-}
9017
-
9018
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
9019
-{
9020
- int idx;
9021
-
9022
- kvm_hv_vcpu_uninit(vcpu);
9023
- kvm_pmu_destroy(vcpu);
9024
- kfree(vcpu->arch.mce_banks);
9025
- kvm_free_lapic(vcpu);
9026
- idx = srcu_read_lock(&vcpu->kvm->srcu);
9027
- kvm_mmu_destroy(vcpu);
9028
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
9029
- free_page((unsigned long)vcpu->arch.pio_data);
9030
- if (!lapic_in_kernel(vcpu))
9031
- static_key_slow_dec(&kvm_no_apic_vcpu);
9032
-}
9033
-
903410533 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
903510534 {
10535
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10536
+
903610537 vcpu->arch.l1tf_flush_l1d = true;
9037
- kvm_x86_ops->sched_in(vcpu, cpu);
10538
+ if (pmu->version && unlikely(pmu->event_count)) {
10539
+ pmu->need_cleanup = true;
10540
+ kvm_make_request(KVM_REQ_PMU, vcpu);
10541
+ }
10542
+ kvm_x86_ops.sched_in(vcpu, cpu);
903810543 }
10544
+
10545
+void kvm_arch_free_vm(struct kvm *kvm)
10546
+{
10547
+ kfree(kvm->arch.hyperv.hv_pa_pg);
10548
+ vfree(kvm);
10549
+}
10550
+
903910551
904010552 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
904110553 {
10554
+ int ret;
10555
+
904210556 if (type)
904310557 return -EINVAL;
10558
+
10559
+ ret = kvm_page_track_init(kvm);
10560
+ if (ret)
10561
+ return ret;
904410562
904510563 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
904610564 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
....@@ -9059,7 +10577,7 @@
905910577 mutex_init(&kvm->arch.apic_map_lock);
906010578 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
906110579
9062
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
10580
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
906310581 pvclock_update_vm_gtod_copy(kvm);
906410582
906510583 kvm->arch.guest_can_read_msr_platform_info = true;
....@@ -9068,13 +10586,9 @@
906810586 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
906910587
907010588 kvm_hv_init_vm(kvm);
9071
- kvm_page_track_init(kvm);
907210589 kvm_mmu_init_vm(kvm);
907310590
9074
- if (kvm_x86_ops->vm_init)
9075
- return kvm_x86_ops->vm_init(kvm);
9076
-
9077
- return 0;
10591
+ return kvm_x86_ops.vm_init(kvm);
907810592 }
907910593
908010594 int kvm_arch_post_init_vm(struct kvm *kvm)
....@@ -9102,7 +10616,7 @@
910210616 kvm_unload_vcpu_mmu(vcpu);
910310617 }
910410618 kvm_for_each_vcpu(i, vcpu, kvm)
9105
- kvm_arch_vcpu_free(vcpu);
10619
+ kvm_vcpu_destroy(vcpu);
910610620
910710621 mutex_lock(&kvm->lock);
910810622 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
....@@ -9122,9 +10636,9 @@
912210636 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
912310637 {
912410638 int i, r;
9125
- unsigned long hva;
10639
+ unsigned long hva, old_npages;
912610640 struct kvm_memslots *slots = kvm_memslots(kvm);
9127
- struct kvm_memory_slot *slot, old;
10641
+ struct kvm_memory_slot *slot;
912810642
912910643 /* Called with kvm->slots_lock held. */
913010644 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
....@@ -9132,7 +10646,7 @@
913210646
913310647 slot = id_to_memslot(slots, id);
913410648 if (size) {
9135
- if (slot->npages)
10649
+ if (slot && slot->npages)
913610650 return -EEXIST;
913710651
913810652 /*
....@@ -9144,13 +10658,13 @@
914410658 if (IS_ERR((void *)hva))
914510659 return PTR_ERR((void *)hva);
914610660 } else {
9147
- if (!slot->npages)
10661
+ if (!slot || !slot->npages)
914810662 return 0;
914910663
10664
+ old_npages = slot->npages;
915010665 hva = 0;
915110666 }
915210667
9153
- old = *slot;
915410668 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
915510669 struct kvm_userspace_memory_region m;
915610670
....@@ -9165,23 +10679,11 @@
916510679 }
916610680
916710681 if (!size)
9168
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
10682
+ vm_munmap(hva, old_npages * PAGE_SIZE);
916910683
917010684 return 0;
917110685 }
917210686 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
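With the locked x86_set_memory_region() wrapper removed below, callers are expected to hold kvm->slots_lock themselves: a non-zero size creates the internal slot, while size == 0 tears it down, exactly as kvm_arch_pre_destroy_vm() further down does for the private slots. A hedged sketch of the creation side (illustrative caller, not taken from the diff; the address is whatever the ioctl path supplies):

/* Illustrative only: mirror the locking rule for __x86_set_memory_region(). */
static int example_map_private_tss(struct kvm *kvm, gpa_t addr)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT,
				    addr, 3 * PAGE_SIZE);
	mutex_unlock(&kvm->slots_lock);
	return r;
}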
9173
-
9174
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9175
-{
9176
- int r;
9177
-
9178
- mutex_lock(&kvm->slots_lock);
9179
- r = __x86_set_memory_region(kvm, id, gpa, size);
9180
- mutex_unlock(&kvm->slots_lock);
9181
-
9182
- return r;
9183
-}
9184
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
918510687
918610688 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
918710689 {
....@@ -9196,46 +10698,47 @@
919610698 	 * unless the memory map has changed due to process exit
919710699 * or fd copying.
919810700 */
9199
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
9200
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
9201
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10701
+ mutex_lock(&kvm->slots_lock);
10702
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10703
+ 0, 0);
10704
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10705
+ 0, 0);
10706
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10707
+ mutex_unlock(&kvm->slots_lock);
920210708 }
9203
- if (kvm_x86_ops->vm_destroy)
9204
- kvm_x86_ops->vm_destroy(kvm);
10709
+ if (kvm_x86_ops.vm_destroy)
10710
+ kvm_x86_ops.vm_destroy(kvm);
10711
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
920510712 kvm_pic_destroy(kvm);
920610713 kvm_ioapic_destroy(kvm);
920710714 kvm_free_vcpus(kvm);
920810715 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10716
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
920910717 kvm_mmu_uninit_vm(kvm);
921010718 kvm_page_track_cleanup(kvm);
921110719 kvm_hv_destroy_vm(kvm);
921210720 }
921310721
9214
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
9215
- struct kvm_memory_slot *dont)
10722
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
921610723 {
921710724 int i;
921810725
921910726 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
9220
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
9221
- kvfree(free->arch.rmap[i]);
9222
- free->arch.rmap[i] = NULL;
9223
- }
10727
+ kvfree(slot->arch.rmap[i]);
10728
+ slot->arch.rmap[i] = NULL;
10729
+
922410730 if (i == 0)
922510731 continue;
922610732
9227
- if (!dont || free->arch.lpage_info[i - 1] !=
9228
- dont->arch.lpage_info[i - 1]) {
9229
- kvfree(free->arch.lpage_info[i - 1]);
9230
- free->arch.lpage_info[i - 1] = NULL;
9231
- }
10733
+ kvfree(slot->arch.lpage_info[i - 1]);
10734
+ slot->arch.lpage_info[i - 1] = NULL;
923210735 }
923310736
9234
- kvm_page_track_free_memslot(free, dont);
10737
+ kvm_page_track_free_memslot(slot);
923510738 }
923610739
9237
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
9238
- unsigned long npages)
10740
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10741
+ unsigned long npages)
923910742 {
924010743 int i;
924110744
....@@ -9257,13 +10760,13 @@
925710760
925810761 slot->arch.rmap[i] =
925910762 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
9260
- GFP_KERNEL);
10763
+ GFP_KERNEL_ACCOUNT);
926110764 if (!slot->arch.rmap[i])
926210765 goto out_free;
926310766 if (i == 0)
926410767 continue;
926510768
9266
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
10769
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
926710770 if (!linfo)
926810771 goto out_free;
926910772
....@@ -9276,11 +10779,9 @@
927610779 ugfn = slot->userspace_addr >> PAGE_SHIFT;
927710780 /*
927810781 * If the gfn and userspace address are not aligned wrt each
9279
- * other, or if explicitly asked to, disable large page
9280
- * support for this slot
10782
+ * other, disable large page support for this slot.
928110783 */
9282
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
9283
- !kvm_largepages_enabled()) {
10784
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
928410785 unsigned long j;
928510786
928610787 for (j = 0; j < lpages; ++j)
....@@ -9327,76 +10828,23 @@
932710828 const struct kvm_userspace_memory_region *mem,
932810829 enum kvm_mr_change change)
932910830 {
9330
- if (change == KVM_MR_MOVE)
9331
- return kvm_arch_create_memslot(kvm, memslot,
9332
- mem->memory_size >> PAGE_SHIFT);
9333
-
10831
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10832
+ return kvm_alloc_memslot_metadata(memslot,
10833
+ mem->memory_size >> PAGE_SHIFT);
933410834 return 0;
933510835 }
933610836
933710837 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
9338
- struct kvm_memory_slot *new)
10838
+ struct kvm_memory_slot *old,
10839
+ struct kvm_memory_slot *new,
10840
+ enum kvm_mr_change change)
933910841 {
9340
- /* Still write protect RO slot */
9341
- if (new->flags & KVM_MEM_READONLY) {
9342
- kvm_mmu_slot_remove_write_access(kvm, new);
9343
- return;
9344
- }
9345
-
934610842 /*
9347
- * Call kvm_x86_ops dirty logging hooks when they are valid.
9348
- *
9349
- * kvm_x86_ops->slot_disable_log_dirty is called when:
9350
- *
9351
- * - KVM_MR_CREATE with dirty logging is disabled
9352
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
9353
- *
9354
- * The reason is, in case of PML, we need to set D-bit for any slots
9355
- * with dirty logging disabled in order to eliminate unnecessary GPA
9356
- * logging in PML buffer (and potential PML buffer full VMEXT). This
9357
- * guarantees leaving PML enabled during guest's lifetime won't have
9358
- * any additonal overhead from PML when guest is running with dirty
9359
- * logging disabled for memory slots.
9360
- *
9361
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
9362
- * to dirty logging mode.
9363
- *
9364
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
9365
- *
9366
- * In case of write protect:
9367
- *
9368
- * Write protect all pages for dirty logging.
9369
- *
9370
- * All the sptes including the large sptes which point to this
9371
- * slot are set to readonly. We can not create any new large
9372
- * spte on this slot until the end of the logging.
9373
- *
9374
- * See the comments in fast_page_fault().
10843
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10844
+ * See comments below.
937510845 */
9376
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
9377
- if (kvm_x86_ops->slot_enable_log_dirty)
9378
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
9379
- else
9380
- kvm_mmu_slot_remove_write_access(kvm, new);
9381
- } else {
9382
- if (kvm_x86_ops->slot_disable_log_dirty)
9383
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
9384
- }
9385
-}
9386
-
9387
-void kvm_arch_commit_memory_region(struct kvm *kvm,
9388
- const struct kvm_userspace_memory_region *mem,
9389
- const struct kvm_memory_slot *old,
9390
- const struct kvm_memory_slot *new,
9391
- enum kvm_mr_change change)
9392
-{
9393
- int nr_mmu_pages = 0;
9394
-
9395
- if (!kvm->arch.n_requested_mmu_pages)
9396
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
9397
-
9398
- if (nr_mmu_pages)
9399
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
10846
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10847
+ return;
940010848
940110849 /*
940210850 * Dirty logging tracks sptes in 4k granularity, meaning that large
....@@ -9409,29 +10857,91 @@
940910857 * Scan sptes if dirty logging has been stopped, dropping those
941010858 * which can be collapsed into a single large-page spte. Later
941110859 * page faults will create the large-page sptes.
10860
+ *
10861
+ * There is no need to do this in any of the following cases:
10862
+ * CREATE: No dirty mappings will already exist.
10863
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
10864
+ * kvm_arch_flush_shadow_memslot()
941210865 */
9413
- if ((change != KVM_MR_DELETE) &&
9414
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
9415
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10866
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10867
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
941610868 kvm_mmu_zap_collapsible_sptes(kvm, new);
941710869
941810870 /*
9419
- * Set up write protection and/or dirty logging for the new slot.
10871
+ * Enable or disable dirty logging for the slot.
942010872 *
9421
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
9422
- * been zapped so no dirty logging staff is needed for old slot. For
9423
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
9424
- * new and it's also covered when dealing with the new slot.
10873
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10874
+ * slot have been zapped so no dirty logging updates are needed for
10875
+ * the old slot.
10876
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10877
+ * any mappings that might be created in it will consume the
10878
+ * properties of the new slot and do not need to be updated here.
942510879 *
10880
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10881
+ * called to enable/disable dirty logging.
10882
+ *
10883
+ * When disabling dirty logging with PML enabled, the D-bit is set
10884
+ * for sptes in the slot in order to prevent unnecessary GPA
10885
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
10886
+ * This guarantees leaving PML enabled for the guest's lifetime
10887
+ * won't have any additional overhead from PML when the guest is
10888
+ * running with dirty logging disabled.
10889
+ *
10890
+ * When enabling dirty logging, large sptes are write-protected
10891
+ * so they can be split on first write. New large sptes cannot
10892
+ * be created for this slot until the end of the logging.
10893
+ * See the comments in fast_page_fault().
10894
+ * For small sptes, nothing is done if the dirty log is in the
10895
+ * initial-all-set state. Otherwise, depending on whether pml
10896
+ * is enabled the D-bit or the W-bit will be cleared.
10897
+ */
10898
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10899
+ if (kvm_x86_ops.slot_enable_log_dirty) {
10900
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10901
+ } else {
10902
+ int level =
10903
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10904
+ PG_LEVEL_2M : PG_LEVEL_4K;
10905
+
10906
+ /*
10907
+ * If we're with initial-all-set, we don't need
10908
+ * to write protect any small page because
10909
+ * they're reported as dirty already. However
10910
+ * we still need to write-protect huge pages
10911
+ * so that the page split can happen lazily on
10912
+ * the first write to the huge page.
10913
+ */
10914
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
10915
+ }
10916
+ } else {
10917
+ if (kvm_x86_ops.slot_disable_log_dirty)
10918
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10919
+ }
10920
+}
10921
+
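The block comment above covers the in-kernel mechanics (write protection vs. PML D-bit clearing, plus the initial-all-set handling of huge pages). For context, the userspace half of dirty logging is just two ioctls; the sketch below is illustrative and assumes the VMM already owns vm_fd, the slot parameters and a bitmap sized at memory_size / PAGE_SIZE bits:

/* Userspace sketch (not part of this file): turn on dirty logging for a
 * slot and harvest the per-page dirty bitmap later. Error handling is
 * reduced to pass/fail. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int track_dirty_pages(int vm_fd, __u32 slot, __u64 guest_phys,
			     __u64 size, void *host_va, void *bitmap)
{
	struct kvm_userspace_memory_region region = {
		.slot		 = slot,
		.flags		 = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = guest_phys,
		.memory_size	 = size,
		.userspace_addr	 = (__u64)(unsigned long)host_va,
	};
	struct kvm_dirty_log log = {
		.slot	      = slot,
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		return -1;
	/* ... guest runs and dirties memory ... */
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}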
10922
+void kvm_arch_commit_memory_region(struct kvm *kvm,
10923
+ const struct kvm_userspace_memory_region *mem,
10924
+ struct kvm_memory_slot *old,
10925
+ const struct kvm_memory_slot *new,
10926
+ enum kvm_mr_change change)
10927
+{
10928
+ if (!kvm->arch.n_requested_mmu_pages)
10929
+ kvm_mmu_change_mmu_pages(kvm,
10930
+ kvm_mmu_calculate_default_mmu_pages(kvm));
10931
+
10932
+ /*
942610933 * FIXME: const-ify all uses of struct kvm_memory_slot.
942710934 */
9428
- if (change != KVM_MR_DELETE)
9429
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
10935
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
10936
+
10937
+ /* Free the arrays associated with the old memslot. */
10938
+ if (change == KVM_MR_MOVE)
10939
+ kvm_arch_free_memslot(kvm, old);
943010940 }
943110941
943210942 void kvm_arch_flush_shadow_all(struct kvm *kvm)
943310943 {
9434
- kvm_mmu_invalidate_zap_all_pages(kvm);
10944
+ kvm_mmu_zap_all(kvm);
943510945 }
943610946
943710947 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
....@@ -9443,8 +10953,8 @@
944310953 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
944410954 {
944510955 return (is_guest_mode(vcpu) &&
9446
- kvm_x86_ops->guest_apic_has_interrupt &&
9447
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
10956
+ kvm_x86_ops.guest_apic_has_interrupt &&
10957
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
944810958 }
944910959
945010960 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
....@@ -9463,11 +10973,12 @@
946310973
946410974 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
946510975 (vcpu->arch.nmi_pending &&
9466
- kvm_x86_ops->nmi_allowed(vcpu)))
10976
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
946710977 return true;
946810978
946910979 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
9470
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
10980
+ (vcpu->arch.smi_pending &&
10981
+ kvm_x86_ops.smi_allowed(vcpu, false)))
947110982 return true;
947210983
947310984 if (kvm_arch_interrupt_allowed(vcpu) &&
....@@ -9476,6 +10987,11 @@
947610987 return true;
947710988
947810989 if (kvm_hv_has_stimer_pending(vcpu))
10990
+ return true;
10991
+
10992
+ if (is_guest_mode(vcpu) &&
10993
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
10994
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
947910995 return true;
948010996
948110997 return false;
....@@ -9496,7 +11012,7 @@
949611012 kvm_test_request(KVM_REQ_EVENT, vcpu))
949711013 return true;
949811014
9499
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
11015
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
950011016 return true;
950111017
950211018 return false;
....@@ -9514,7 +11030,7 @@
951411030
951511031 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
951611032 {
9517
- return kvm_x86_ops->interrupt_allowed(vcpu);
11033
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
951811034 }
951911035
952011036 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
....@@ -9536,7 +11052,7 @@
953611052 {
953711053 unsigned long rflags;
953811054
9539
- rflags = kvm_x86_ops->get_rflags(vcpu);
11055
+ rflags = kvm_x86_ops.get_rflags(vcpu);
954011056 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
954111057 rflags &= ~X86_EFLAGS_TF;
954211058 return rflags;
....@@ -9548,7 +11064,7 @@
954811064 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
954911065 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
955011066 rflags |= X86_EFLAGS_TF;
9551
- kvm_x86_ops->set_rflags(vcpu, rflags);
11067
+ kvm_x86_ops.set_rflags(vcpu, rflags);
955211068 }
955311069
955411070 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
....@@ -9562,7 +11078,7 @@
956211078 {
956311079 int r;
956411080
9565
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
11081
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
956611082 work->wakeup_all)
956711083 return;
956811084
....@@ -9570,21 +11086,23 @@
957011086 if (unlikely(r))
957111087 return;
957211088
9573
- if (!vcpu->arch.mmu.direct_map &&
9574
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
11089
+ if (!vcpu->arch.mmu->direct_map &&
11090
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
957511091 return;
957611092
9577
- vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
11093
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
957811094 }
957911095
958011096 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
958111097 {
11098
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
11099
+
958211100 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
958311101 }
958411102
958511103 static inline u32 kvm_async_pf_next_probe(u32 key)
958611104 {
9587
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
11105
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
958811106 }
958911107
959011108 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
....@@ -9602,7 +11120,7 @@
960211120 int i;
960311121 u32 key = kvm_async_pf_hash_fn(gfn);
960411122
9605
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
11123
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
960611124 (vcpu->arch.apf.gfns[key] != gfn &&
960711125 vcpu->arch.apf.gfns[key] != ~0); i++)
960811126 key = kvm_async_pf_next_probe(key);
....@@ -9620,6 +11138,10 @@
962011138 u32 i, j, k;
962111139
962211140 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
11141
+
11142
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
11143
+ return;
11144
+
962311145 while (true) {
962411146 vcpu->arch.apf.gfns[i] = ~0;
962511147 do {
....@@ -9638,21 +11160,64 @@
963811160 }
963911161 }
964011162
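The async-PF gfn table above is a small open-addressed hash: ASYNC_PF_PER_VCPU is a power of two, so probing wraps with a simple mask, and kvm_del_async_pf_gfn() re-slots entries so later lookups never hit a hole. The toy below reproduces only the insert/probe part in isolation (own names, a throwaway hash, 0 as the empty marker; deletion/backshift is deliberately left out, and the caller is assumed to bound the number of live entries):

/* Standalone toy of the probing scheme, for illustration only. */
#define SLOTS 64U				/* must be a power of two */

static unsigned long long table[SLOTS];		/* 0 means "empty" here */

static unsigned int slot_hash(unsigned long long gfn)
{
	return (unsigned int)(gfn * 0x9E3779B97F4A7C15ULL) & (SLOTS - 1);
}

static void slot_add(unsigned long long gfn)
{
	unsigned int key = slot_hash(gfn);

	while (table[key])			/* linear probe ... */
		key = (key + 1) & (SLOTS - 1);	/* ... wrapping via the mask */
	table[key] = gfn;
}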
9641
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
11163
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
964211164 {
11165
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
964311166
9644
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
9645
- sizeof(val));
11167
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
11168
+ sizeof(reason));
964611169 }
964711170
9648
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
11171
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
964911172 {
11173
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
965011174
9651
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
9652
- sizeof(u32));
11175
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11176
+ &token, offset, sizeof(token));
965311177 }
965411178
9655
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
11179
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
11180
+{
11181
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11182
+ u32 val;
11183
+
11184
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11185
+ &val, offset, sizeof(val)))
11186
+ return false;
11187
+
11188
+ return !val;
11189
+}
11190
+
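apf_put_user_ready() and apf_pageready_slot_free() above form a one-deep mailbox around the token field of the guest's shared struct kvm_vcpu_pv_apf_data: the host writes a non-zero token for a 'page ready' event and will not queue another until the slot reads back as zero. A hedged guest-side sketch of the matching acknowledgement (the interrupt registration and any acknowledgement MSR write are omitted; apf_shared and wake_token_waiters() are assumptions for the example):

/* Guest-side sketch, not part of this diff: consume and clear the token
 * so the host sees the slot as free again. */
static struct kvm_vcpu_pv_apf_data *apf_shared;	/* page registered via the async-PF MSR */

static void handle_page_ready_interrupt(void)
{
	unsigned int token = READ_ONCE(apf_shared->token);

	if (token) {
		wake_token_waiters(token);		/* hypothetical: resume the stalled task */
		WRITE_ONCE(apf_shared->token, 0);	/* mark the slot free for the host */
	}
}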
11191
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
11192
+{
11193
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
11194
+ return false;
11195
+
11196
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
11197
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
11198
+ return false;
11199
+
11200
+ return true;
11201
+}
11202
+
11203
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
11204
+{
11205
+ if (unlikely(!lapic_in_kernel(vcpu) ||
11206
+ kvm_event_needs_reinjection(vcpu) ||
11207
+ vcpu->arch.exception.pending))
11208
+ return false;
11209
+
11210
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
11211
+ return false;
11212
+
11213
+ /*
11214
+ * If interrupts are off we cannot even use an artificial
11215
+ * halt state.
11216
+ */
11217
+ return kvm_arch_interrupt_allowed(vcpu);
11218
+}
11219
+
11220
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
965611221 struct kvm_async_pf *work)
965711222 {
965811223 struct x86_exception fault;
....@@ -9660,11 +11225,8 @@
966011225 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
966111226 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
966211227
9663
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
9664
- (vcpu->arch.apf.send_user_only &&
9665
- kvm_x86_ops->get_cpl(vcpu) == 0))
9666
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
9667
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
11228
+ if (kvm_can_deliver_async_pf(vcpu) &&
11229
+ !apf_put_user_notpresent(vcpu)) {
966811230 fault.vector = PF_VECTOR;
966911231 fault.error_code_valid = true;
967011232 fault.error_code = 0;
....@@ -9672,14 +11234,28 @@
967211234 fault.address = work->arch.token;
967311235 fault.async_page_fault = true;
967411236 kvm_inject_page_fault(vcpu, &fault);
11237
+ return true;
11238
+ } else {
11239
+ /*
11240
+ * It is not possible to deliver a paravirtualized asynchronous
11241
+ * page fault, but putting the guest in an artificial halt state
11242
+ * can be beneficial nevertheless: if an interrupt arrives, we
11243
+ * can deliver it timely and perhaps the guest will schedule
11244
+ * another process. When the instruction that triggered a page
11245
+ * fault is retried, hopefully the page will be ready in the host.
11246
+ */
11247
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
11248
+ return false;
967511249 }
967611250 }
967711251
967811252 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
967911253 struct kvm_async_pf *work)
968011254 {
9681
- struct x86_exception fault;
9682
- u32 val;
11255
+ struct kvm_lapic_irq irq = {
11256
+ .delivery_mode = APIC_DM_FIXED,
11257
+ .vector = vcpu->arch.apf.vec
11258
+ };
968311259
968411260 if (work->wakeup_all)
968511261 work->arch.token = ~0; /* broadcast wakeup */
....@@ -9687,37 +11263,30 @@
968711263 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
968811264 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
968911265
9690
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
9691
- !apf_get_user(vcpu, &val)) {
9692
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
9693
- vcpu->arch.exception.pending &&
9694
- vcpu->arch.exception.nr == PF_VECTOR &&
9695
- !apf_put_user(vcpu, 0)) {
9696
- vcpu->arch.exception.injected = false;
9697
- vcpu->arch.exception.pending = false;
9698
- vcpu->arch.exception.nr = 0;
9699
- vcpu->arch.exception.has_error_code = false;
9700
- vcpu->arch.exception.error_code = 0;
9701
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
9702
- fault.vector = PF_VECTOR;
9703
- fault.error_code_valid = true;
9704
- fault.error_code = 0;
9705
- fault.nested_page_fault = false;
9706
- fault.address = work->arch.token;
9707
- fault.async_page_fault = true;
9708
- kvm_inject_page_fault(vcpu, &fault);
9709
- }
11266
+ if ((work->wakeup_all || work->notpresent_injected) &&
11267
+ kvm_pv_async_pf_enabled(vcpu) &&
11268
+ !apf_put_user_ready(vcpu, work->arch.token)) {
11269
+ vcpu->arch.apf.pageready_pending = true;
11270
+ kvm_apic_set_irq(vcpu, &irq, NULL);
971011271 }
11272
+
971111273 vcpu->arch.apf.halted = false;
971211274 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
971311275 }
971411276
9715
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
11277
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
971611278 {
9717
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
11279
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
11280
+ if (!vcpu->arch.apf.pageready_pending)
11281
+ kvm_vcpu_kick(vcpu);
11282
+}
11283
+
11284
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11285
+{
11286
+ if (!kvm_pv_async_pf_enabled(vcpu))
971811287 return true;
971911288 else
9720
- return kvm_can_do_async_pf(vcpu);
11289
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
972111290 }
972211291
972311292 void kvm_arch_start_assignment(struct kvm *kvm)
....@@ -9732,9 +11301,9 @@
973211301 }
973311302 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
973411303
9735
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
11304
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
973611305 {
9737
- return atomic_read(&kvm->arch.assigned_device_count);
11306
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
973811307 }
973911308 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
974011309
....@@ -9758,7 +11327,7 @@
975811327
975911328 bool kvm_arch_has_irq_bypass(void)
976011329 {
9761
- return kvm_x86_ops->update_pi_irte != NULL;
11330
+ return true;
976211331 }
976311332
976411333 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
....@@ -9766,11 +11335,17 @@
976611335 {
976711336 struct kvm_kernel_irqfd *irqfd =
976811337 container_of(cons, struct kvm_kernel_irqfd, consumer);
11338
+ int ret;
976911339
977011340 irqfd->producer = prod;
11341
+ kvm_arch_start_assignment(irqfd->kvm);
11342
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11343
+ prod->irq, irqfd->gsi, 1);
977111344
9772
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
9773
- prod->irq, irqfd->gsi, 1);
11345
+ if (ret)
11346
+ kvm_arch_end_assignment(irqfd->kvm);
11347
+
11348
+ return ret;
977411349 }
977511350
977611351 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
....@@ -9789,26 +11364,185 @@
978911364 * when the irq is masked/disabled or the consumer side (KVM
979011365 	 * in this case) doesn't want to receive the interrupts.
979111366 */
9792
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11367
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
979311368 if (ret)
979411369 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
979511370 " fails: %d\n", irqfd->consumer.token, ret);
11371
+
11372
+ kvm_arch_end_assignment(irqfd->kvm);
979611373 }
979711374
979811375 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
979911376 uint32_t guest_irq, bool set)
980011377 {
9801
- if (!kvm_x86_ops->update_pi_irte)
9802
- return -EINVAL;
9803
-
9804
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
11378
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
980511379 }
980611380
980711381 bool kvm_vector_hashing_enabled(void)
980811382 {
980911383 return vector_hashing;
981011384 }
9811
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
11385
+
11386
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11387
+{
11388
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11389
+}
11390
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11391
+
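kvm_arch_no_poll() above keys host-side halt polling off bit 0 of the guest's poll-control MSR, and kvm_arch_vcpu_postcreate() earlier defaults the value to 1 (polling allowed). A guest that polls in its own idle loop can clear the bit, roughly as sketched here (guest kernel context; MSR_KVM_POLL_CONTROL and KVM_FEATURE_POLL_CONTROL are taken from the para-virt headers and assumed available):

/* Guest-side sketch: opt out of host halt polling. */
static void guest_disable_host_poll(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);	/* bit 0 clear: host should not poll */
}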
11392
+
11393
+int kvm_spec_ctrl_test_value(u64 value)
11394
+{
11395
+ /*
11396
+ * test that setting IA32_SPEC_CTRL to given value
11397
+ * is allowed by the host processor
11398
+ */
11399
+
11400
+ u64 saved_value;
11401
+ unsigned long flags;
11402
+ int ret = 0;
11403
+
11404
+ local_irq_save(flags);
11405
+
11406
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11407
+ ret = 1;
11408
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11409
+ ret = 1;
11410
+ else
11411
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11412
+
11413
+ local_irq_restore(flags);
11414
+
11415
+ return ret;
11416
+}
11417
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
11418
+
11419
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11420
+{
11421
+ struct x86_exception fault;
11422
+ u32 access = error_code &
11423
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11424
+
11425
+ if (!(error_code & PFERR_PRESENT_MASK) ||
11426
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11427
+ /*
11428
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11429
+ * tables probably do not match the TLB. Just proceed
11430
+ * with the error code that the processor gave.
11431
+ */
11432
+ fault.vector = PF_VECTOR;
11433
+ fault.error_code_valid = true;
11434
+ fault.error_code = error_code;
11435
+ fault.nested_page_fault = false;
11436
+ fault.address = gva;
11437
+ }
11438
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11439
+}
11440
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11441
+
11442
+/*
11443
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11444
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11445
+ * indicates whether exit to userspace is needed.
11446
+ */
11447
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11448
+ struct x86_exception *e)
11449
+{
11450
+ if (r == X86EMUL_PROPAGATE_FAULT) {
11451
+ kvm_inject_emulated_page_fault(vcpu, e);
11452
+ return 1;
11453
+ }
11454
+
11455
+ /*
11456
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11457
+ * while handling a VMX instruction KVM could've handled the request
11458
+ * correctly by exiting to userspace and performing I/O but there
11459
+ * doesn't seem to be a real use-case behind such requests, just return
11460
+ * KVM_EXIT_INTERNAL_ERROR for now.
11461
+ */
11462
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11463
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11464
+ vcpu->run->internal.ndata = 0;
11465
+
11466
+ return 0;
11467
+}
11468
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
11469
+
11470
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11471
+{
11472
+ bool pcid_enabled;
11473
+ struct x86_exception e;
11474
+ unsigned i;
11475
+ unsigned long roots_to_free = 0;
11476
+ struct {
11477
+ u64 pcid;
11478
+ u64 gla;
11479
+ } operand;
11480
+ int r;
11481
+
11482
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11483
+ if (r != X86EMUL_CONTINUE)
11484
+ return kvm_handle_memory_failure(vcpu, r, &e);
11485
+
11486
+ if (operand.pcid >> 12 != 0) {
11487
+ kvm_inject_gp(vcpu, 0);
11488
+ return 1;
11489
+ }
11490
+
11491
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11492
+
11493
+ switch (type) {
11494
+ case INVPCID_TYPE_INDIV_ADDR:
11495
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
11496
+ is_noncanonical_address(operand.gla, vcpu)) {
11497
+ kvm_inject_gp(vcpu, 0);
11498
+ return 1;
11499
+ }
11500
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11501
+ return kvm_skip_emulated_instruction(vcpu);
11502
+
11503
+ case INVPCID_TYPE_SINGLE_CTXT:
11504
+ if (!pcid_enabled && (operand.pcid != 0)) {
11505
+ kvm_inject_gp(vcpu, 0);
11506
+ return 1;
11507
+ }
11508
+
11509
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11510
+ kvm_mmu_sync_roots(vcpu);
11511
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11512
+ }
11513
+
11514
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11515
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11516
+ == operand.pcid)
11517
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11518
+
11519
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11520
+ /*
11521
+ * If neither the current cr3 nor any of the prev_roots use the
11522
+ * given PCID, then nothing needs to be done here because a
11523
+ * resync will happen anyway before switching to any other CR3.
11524
+ */
11525
+
11526
+ return kvm_skip_emulated_instruction(vcpu);
11527
+
11528
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
11529
+ /*
11530
+ * Currently, KVM doesn't mark global entries in the shadow
11531
+ * page tables, so a non-global flush just degenerates to a
11532
+ * global flush. If needed, we could optimize this later by
11533
+ * keeping track of global entries in shadow page tables.
11534
+ */
11535
+
11536
+ fallthrough;
11537
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
11538
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
11539
+ return kvm_skip_emulated_instruction(vcpu);
11540
+
11541
+ default:
11542
+ BUG(); /* We have already checked above that type <= 3 */
11543
+ }
11544
+}
11545
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
981211546
981311547 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
981411548 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
....@@ -9820,12 +11554,31 @@
982011554 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
982111555 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
982211556 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11557
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
982311558 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
982411559 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
982511560 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
982611561 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
9827
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
11562
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
982811563 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
982911564 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
983011565 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
983111566 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11567
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11568
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11569
+
11570
+static int __init kvm_x86_init(void)
11571
+{
11572
+ kvm_mmu_x86_module_init();
11573
+ return 0;
11574
+}
11575
+module_init(kvm_x86_init);
11576
+
11577
+static void __exit kvm_x86_exit(void)
11578
+{
11579
+ /*
11580
+ * If module_init() is implemented, module_exit() must also be
11581
+ * implemented to allow module unload.
11582
+ */
11583
+}
11584
+module_exit(kvm_x86_exit);