2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -13,22 +14,21 @@
  * Yaniv Kamay <yaniv@qumranet.com>
  * Amit Shah <amit.shah@qumranet.com>
  * Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */

 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
 #include "hyperv.h"
+#include "lapic.h"

 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -54,7 +54,9 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>

 #include <trace/events/kvm.h>

@@ -69,6 +71,10 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <clocksource/hyperv_timer.h>

 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -79,7 +85,7 @@
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);

 #define emul_to_vcpu(ctxt) \
-	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+	((struct kvm_vcpu *)(ctxt)->vcpu)

 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@

 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 				KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

@@ -108,7 +111,7 @@
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);

-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);

 static bool __read_mostly ignore_msrs = 0;
@@ -138,10 +141,14 @@
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -153,85 +160,149 @@
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);

-#define KVM_NR_SHARED_MSRS 16
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);

-struct kvm_shared_msrs_global {
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs_global {
 	int nr;
-	u32 msrs[KVM_NR_SHARED_MSRS];
+	u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };

-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
 	struct user_return_notifier urn;
 	bool registered;
-	struct kvm_shared_msr_values {
+	struct kvm_user_return_msr_values {
 		u64 host;
 		u64 curr;
-	} values[KVM_NR_SHARED_MSRS];
+	} values[KVM_MAX_NR_USER_RETURN_MSRS];
 };

-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+				| XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);

 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "pf_fixed", VCPU_STAT(pf_fixed) },
-	{ "pf_guest", VCPU_STAT(pf_guest) },
-	{ "tlb_flush", VCPU_STAT(tlb_flush) },
-	{ "invlpg", VCPU_STAT(invlpg) },
-	{ "exits", VCPU_STAT(exits) },
-	{ "io_exits", VCPU_STAT(io_exits) },
-	{ "mmio_exits", VCPU_STAT(mmio_exits) },
-	{ "signal_exits", VCPU_STAT(signal_exits) },
-	{ "irq_window", VCPU_STAT(irq_window_exits) },
-	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
-	{ "halt_exits", VCPU_STAT(halt_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "irq_exits", VCPU_STAT(irq_exits) },
-	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "fpu_reload", VCPU_STAT(fpu_reload) },
-	{ "insn_emulation", VCPU_STAT(insn_emulation) },
-	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-	{ "irq_injections", VCPU_STAT(irq_injections) },
-	{ "nmi_injections", VCPU_STAT(nmi_injections) },
-	{ "req_event", VCPU_STAT(req_event) },
-	{ "l1d_flush", VCPU_STAT(l1d_flush) },
-	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-	{ "mmu_flooded", VM_STAT(mmu_flooded) },
-	{ "mmu_recycled", VM_STAT(mmu_recycled) },
-	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages, .mode = 0444) },
-	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-	{ "max_mmu_page_hash_collisions",
-		VM_STAT(max_mmu_page_hash_collisions) },
+	VCPU_STAT("pf_fixed", pf_fixed),
+	VCPU_STAT("pf_guest", pf_guest),
+	VCPU_STAT("tlb_flush", tlb_flush),
+	VCPU_STAT("invlpg", invlpg),
+	VCPU_STAT("exits", exits),
+	VCPU_STAT("io_exits", io_exits),
+	VCPU_STAT("mmio_exits", mmio_exits),
+	VCPU_STAT("signal_exits", signal_exits),
+	VCPU_STAT("irq_window", irq_window_exits),
+	VCPU_STAT("nmi_window", nmi_window_exits),
+	VCPU_STAT("halt_exits", halt_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hypercalls", hypercalls),
+	VCPU_STAT("request_irq", request_irq_exits),
+	VCPU_STAT("irq_exits", irq_exits),
+	VCPU_STAT("host_state_reload", host_state_reload),
+	VCPU_STAT("fpu_reload", fpu_reload),
+	VCPU_STAT("insn_emulation", insn_emulation),
+	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+	VCPU_STAT("irq_injections", irq_injections),
+	VCPU_STAT("nmi_injections", nmi_injections),
+	VCPU_STAT("req_event", req_event),
+	VCPU_STAT("l1d_flush", l1d_flush),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("preemption_reported", preemption_reported),
+	VCPU_STAT("preemption_other", preemption_other),
+	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+	VM_STAT("mmu_pte_write", mmu_pte_write),
+	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+	VM_STAT("mmu_flooded", mmu_flooded),
+	VM_STAT("mmu_recycled", mmu_recycled),
+	VM_STAT("mmu_cache_miss", mmu_cache_miss),
+	VM_STAT("mmu_unsync", mmu_unsync),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("largepages", lpages, .mode = 0444),
+	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 	{ NULL }
 };

 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
+
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+				  u64 data, bool write)
+{
+	const char *op = write ? "wrmsr" : "rdmsr";
+
+	if (ignore_msrs) {
+		if (report_ignored_msrs)
+			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		/* Mask the error */
+		return true;
+	} else {
+		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		return false;
+	}
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+	unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+	return kmem_cache_create_usercopy("x86_emulator", size,
+					  __alignof__(struct x86_emulate_ctxt),
+					  SLAB_ACCOUNT, useroffset,
+					  size - useroffset, NULL);
+}

 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 		vcpu->arch.apf.gfns[i] = ~0;
 }

 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
-	struct kvm_shared_msrs *locals
-		= container_of(urn, struct kvm_shared_msrs, urn);
-	struct kvm_shared_msr_values *values;
+	struct kvm_user_return_msrs *msrs
+		= container_of(urn, struct kvm_user_return_msrs, urn);
+	struct kvm_user_return_msr_values *values;
 	unsigned long flags;

 	/*
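Side note, not part of the commit: the comment introduced above explains why host values of user-return MSRs (SYSCALL MSRs, TSC_AUX) can be restored lazily instead of on every VM-exit. Below is a minimal standalone C sketch of that defer-until-return pattern; all names are hypothetical and only mirror the shape of kvm_set_user_return_msr()/kvm_on_user_return(), they are not the kernel's user_return_notifier API.

/*
 * Illustrative sketch only -- not taken from the commit. Remember the host
 * value per slot, write hardware only when the value actually changes, and
 * restore host values once on the way back to userspace rather than on
 * every VM-exit.
 */
#include <stdbool.h>
#include <stdint.h>

#define NR_DEFERRED_MSRS 16	/* mirrors KVM_MAX_NR_USER_RETURN_MSRS */

struct deferred_msr {
	uint32_t index;		/* MSR number */
	uint64_t host;		/* value the host expects */
	uint64_t curr;		/* value currently loaded in hardware */
};

static struct deferred_msr deferred[NR_DEFERRED_MSRS];
static int nr_deferred;
static bool restore_pending;

static void hw_write_msr(uint32_t index, uint64_t value)
{
	/* Stand-in for wrmsrl(); a real implementation touches hardware. */
	(void)index;
	(void)value;
}

/* Load a guest value; skip the expensive MSR write if nothing changed. */
void deferred_msr_set(int slot, uint64_t guest_value)
{
	struct deferred_msr *m = &deferred[slot];

	if (guest_value == m->curr)
		return;
	hw_write_msr(m->index, guest_value);
	m->curr = guest_value;
	restore_pending = true;	/* arm the return-to-userspace hook */
}

/* Called once when control is about to leave the kernel. */
void deferred_msr_on_user_return(void)
{
	int i;

	if (!restore_pending)
		return;
	for (i = 0; i < nr_deferred; i++) {
		if (deferred[i].curr != deferred[i].host) {
			hw_write_msr(deferred[i].index, deferred[i].host);
			deferred[i].curr = deferred[i].host;
		}
	}
	restore_pending = false;
}

The point of the pattern is that the WRMSR restoring the host value happens at most once per excursion to userspace, not once per VM-exit.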
....@@ -239,84 +310,89 @@
239310 * interrupted and executed through kvm_arch_hardware_disable()
240311 */
241312 local_irq_save(flags);
242
- if (locals->registered) {
243
- locals->registered = false;
313
+ if (msrs->registered) {
314
+ msrs->registered = false;
244315 user_return_notifier_unregister(urn);
245316 }
246317 local_irq_restore(flags);
247
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
248
- values = &locals->values[slot];
318
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
319
+ values = &msrs->values[slot];
249320 if (values->host != values->curr) {
250
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
321
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
251322 values->curr = values->host;
252323 }
253324 }
254325 }
255326
256
-static void shared_msr_update(unsigned slot, u32 msr)
327
+int kvm_probe_user_return_msr(u32 msr)
257328 {
329
+ u64 val;
330
+ int ret;
331
+
332
+ preempt_disable();
333
+ ret = rdmsrl_safe(msr, &val);
334
+ if (ret)
335
+ goto out;
336
+ ret = wrmsrl_safe(msr, val);
337
+out:
338
+ preempt_enable();
339
+ return ret;
340
+}
341
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
342
+
343
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
344
+{
345
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
346
+ user_return_msrs_global.msrs[slot] = msr;
347
+ if (slot >= user_return_msrs_global.nr)
348
+ user_return_msrs_global.nr = slot + 1;
349
+}
350
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
351
+
352
+static void kvm_user_return_msr_cpu_online(void)
353
+{
354
+ unsigned int cpu = smp_processor_id();
355
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
258356 u64 value;
259
- unsigned int cpu = smp_processor_id();
260
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
357
+ int i;
261358
262
- /* only read, and nobody should modify it at this time,
263
- * so don't need lock */
264
- if (slot >= shared_msrs_global.nr) {
265
- printk(KERN_ERR "kvm: invalid MSR slot!");
266
- return;
359
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
360
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
361
+ msrs->values[i].host = value;
362
+ msrs->values[i].curr = value;
267363 }
268
- rdmsrl_safe(msr, &value);
269
- smsr->values[slot].host = value;
270
- smsr->values[slot].curr = value;
271364 }
272365
273
-void kvm_define_shared_msr(unsigned slot, u32 msr)
274
-{
275
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
276
- shared_msrs_global.msrs[slot] = msr;
277
- if (slot >= shared_msrs_global.nr)
278
- shared_msrs_global.nr = slot + 1;
279
-}
280
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
281
-
282
-static void kvm_shared_msr_cpu_online(void)
283
-{
284
- unsigned i;
285
-
286
- for (i = 0; i < shared_msrs_global.nr; ++i)
287
- shared_msr_update(i, shared_msrs_global.msrs[i]);
288
-}
289
-
290
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
366
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
291367 {
292368 unsigned int cpu = smp_processor_id();
293
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
369
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
294370 int err;
295371
296
- value = (value & mask) | (smsr->values[slot].host & ~mask);
297
- if (value == smsr->values[slot].curr)
372
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
373
+ if (value == msrs->values[slot].curr)
298374 return 0;
299
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
375
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
300376 if (err)
301377 return 1;
302378
303
- smsr->values[slot].curr = value;
304
- if (!smsr->registered) {
305
- smsr->urn.on_user_return = kvm_on_user_return;
306
- user_return_notifier_register(&smsr->urn);
307
- smsr->registered = true;
379
+ msrs->values[slot].curr = value;
380
+ if (!msrs->registered) {
381
+ msrs->urn.on_user_return = kvm_on_user_return;
382
+ user_return_notifier_register(&msrs->urn);
383
+ msrs->registered = true;
308384 }
309385 return 0;
310386 }
311
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
387
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
312388
313389 static void drop_user_return_notifiers(void)
314390 {
315391 unsigned int cpu = smp_processor_id();
316
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
392
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
317393
318
- if (smsr->registered)
319
- kvm_on_user_return(&smsr->urn);
394
+ if (msrs->registered)
395
+ kvm_on_user_return(&msrs->urn);
320396 }
321397
322398 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
....@@ -348,14 +424,15 @@
348424 }
349425
350426 kvm_lapic_set_base(vcpu, msr_info->data);
427
+ kvm_recalculate_apic_map(vcpu->kvm);
351428 return 0;
352429 }
353430 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
354431
355
-asmlinkage __visible void kvm_spurious_fault(void)
432
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
356433 {
357434 /* Fault while not rebooting. We want the trace. */
358
- BUG();
435
+ BUG_ON(!kvm_rebooting);
359436 }
360437 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
361438
....@@ -384,6 +461,7 @@
384461 #define EXCPT_TRAP 1
385462 #define EXCPT_ABORT 2
386463 #define EXCPT_INTERRUPT 3
464
+#define EXCPT_DB 4
387465
388466 static int exception_type(int vector)
389467 {
....@@ -394,8 +472,14 @@
394472
395473 mask = 1 << vector;
396474
397
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
398
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
475
+ /*
476
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
477
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
478
+ */
479
+ if (mask & (1 << DB_VECTOR))
480
+ return EXCPT_DB;
481
+
482
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
399483 return EXCPT_TRAP;
400484
401485 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
....@@ -405,9 +489,59 @@
405489 return EXCPT_FAULT;
406490 }
407491
492
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
493
+{
494
+ unsigned nr = vcpu->arch.exception.nr;
495
+ bool has_payload = vcpu->arch.exception.has_payload;
496
+ unsigned long payload = vcpu->arch.exception.payload;
497
+
498
+ if (!has_payload)
499
+ return;
500
+
501
+ switch (nr) {
502
+ case DB_VECTOR:
503
+ /*
504
+ * "Certain debug exceptions may clear bit 0-3. The
505
+ * remaining contents of the DR6 register are never
506
+ * cleared by the processor".
507
+ */
508
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
509
+ /*
510
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
511
+ */
512
+ vcpu->arch.dr6 |= DR6_RTM;
513
+ vcpu->arch.dr6 |= payload;
514
+ /*
515
+ * Bit 16 should be set in the payload whenever the #DB
516
+ * exception should clear DR6.RTM. This makes the payload
517
+ * compatible with the pending debug exceptions under VMX.
518
+ * Though not currently documented in the SDM, this also
519
+ * makes the payload compatible with the exit qualification
520
+ * for #DB exceptions under VMX.
521
+ */
522
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
523
+
524
+ /*
525
+ * The #DB payload is defined as compatible with the 'pending
526
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
527
+ * defined in the 'pending debug exceptions' field (enabled
528
+ * breakpoint), it is reserved and must be zero in DR6.
529
+ */
530
+ vcpu->arch.dr6 &= ~BIT(12);
531
+ break;
532
+ case PF_VECTOR:
533
+ vcpu->arch.cr2 = payload;
534
+ break;
535
+ }
536
+
537
+ vcpu->arch.exception.has_payload = false;
538
+ vcpu->arch.exception.payload = 0;
539
+}
540
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
541
+
408542 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
409543 unsigned nr, bool has_error, u32 error_code,
410
- bool reinject)
544
+ bool has_payload, unsigned long payload, bool reinject)
411545 {
412546 u32 prev_nr;
413547 int class1, class2;
....@@ -427,6 +561,14 @@
427561 */
428562 WARN_ON_ONCE(vcpu->arch.exception.pending);
429563 vcpu->arch.exception.injected = true;
564
+ if (WARN_ON_ONCE(has_payload)) {
565
+ /*
566
+ * A reinjected event has already
567
+ * delivered its payload.
568
+ */
569
+ has_payload = false;
570
+ payload = 0;
571
+ }
430572 } else {
431573 vcpu->arch.exception.pending = true;
432574 vcpu->arch.exception.injected = false;
....@@ -434,6 +576,10 @@
434576 vcpu->arch.exception.has_error_code = has_error;
435577 vcpu->arch.exception.nr = nr;
436578 vcpu->arch.exception.error_code = error_code;
579
+ vcpu->arch.exception.has_payload = has_payload;
580
+ vcpu->arch.exception.payload = payload;
581
+ if (!is_guest_mode(vcpu))
582
+ kvm_deliver_exception_payload(vcpu);
437583 return;
438584 }
439585
....@@ -458,6 +604,8 @@
458604 vcpu->arch.exception.has_error_code = true;
459605 vcpu->arch.exception.nr = DF_VECTOR;
460606 vcpu->arch.exception.error_code = 0;
607
+ vcpu->arch.exception.has_payload = false;
608
+ vcpu->arch.exception.payload = 0;
461609 } else
462610 /* replace previous exception with a new one in a hope
463611 that instruction re-execution will regenerate lost
....@@ -467,15 +615,29 @@
467615
468616 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
469617 {
470
- kvm_multiple_exception(vcpu, nr, false, 0, false);
618
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
471619 }
472620 EXPORT_SYMBOL_GPL(kvm_queue_exception);
473621
474622 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
475623 {
476
- kvm_multiple_exception(vcpu, nr, false, 0, true);
624
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
477625 }
478626 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
627
+
628
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
629
+ unsigned long payload)
630
+{
631
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
632
+}
633
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
634
+
635
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
636
+ u32 error_code, unsigned long payload)
637
+{
638
+ kvm_multiple_exception(vcpu, nr, true, error_code,
639
+ true, payload, false);
640
+}
479641
480642 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
481643 {
....@@ -493,23 +655,38 @@
493655 ++vcpu->stat.pf_guest;
494656 vcpu->arch.exception.nested_apf =
495657 is_guest_mode(vcpu) && fault->async_page_fault;
496
- if (vcpu->arch.exception.nested_apf)
658
+ if (vcpu->arch.exception.nested_apf) {
497659 vcpu->arch.apf.nested_apf_token = fault->address;
498
- else
499
- vcpu->arch.cr2 = fault->address;
500
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
660
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
661
+ } else {
662
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
663
+ fault->address);
664
+ }
501665 }
502666 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
503667
504
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
668
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
669
+ struct x86_exception *fault)
505670 {
506
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
507
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
508
- else
509
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
671
+ struct kvm_mmu *fault_mmu;
672
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
510673
674
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
675
+ vcpu->arch.walk_mmu;
676
+
677
+ /*
678
+ * Invalidate the TLB entry for the faulting address, if it exists,
679
+ * else the access will fault indefinitely (and to emulate hardware).
680
+ */
681
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
682
+ !(fault->error_code & PFERR_RSVD_MASK))
683
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
684
+ fault_mmu->root_hpa);
685
+
686
+ fault_mmu->inject_page_fault(vcpu, fault);
511687 return fault->nested_page_fault;
512688 }
689
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
513690
514691 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
515692 {
....@@ -520,13 +697,13 @@
520697
521698 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
522699 {
523
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
700
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
524701 }
525702 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
526703
527704 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
528705 {
529
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
706
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
530707 }
531708 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
532709
....@@ -536,7 +713,7 @@
536713 */
537714 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
538715 {
539
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
716
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
540717 return true;
541718 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
542719 return false;
....@@ -618,10 +795,8 @@
618795 ret = 1;
619796
620797 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
621
- __set_bit(VCPU_EXREG_PDPTR,
622
- (unsigned long *)&vcpu->arch.regs_avail);
623
- __set_bit(VCPU_EXREG_PDPTR,
624
- (unsigned long *)&vcpu->arch.regs_dirty);
798
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
799
+
625800 out:
626801
627802 return ret;
....@@ -631,7 +806,6 @@
631806 bool pdptrs_changed(struct kvm_vcpu *vcpu)
632807 {
633808 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
634
- bool changed = true;
635809 int offset;
636810 gfn_t gfn;
637811 int r;
....@@ -639,8 +813,7 @@
639813 if (!is_pae_paging(vcpu))
640814 return false;
641815
642
- if (!test_bit(VCPU_EXREG_PDPTR,
643
- (unsigned long *)&vcpu->arch.regs_avail))
816
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
644817 return true;
645818
646819 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
....@@ -648,17 +821,16 @@
648821 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
649822 PFERR_USER_MASK | PFERR_WRITE_MASK);
650823 if (r < 0)
651
- goto out;
652
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
653
-out:
824
+ return true;
654825
655
- return changed;
826
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
656827 }
657828 EXPORT_SYMBOL_GPL(pdptrs_changed);
658829
659830 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
660831 {
661832 unsigned long old_cr0 = kvm_read_cr0(vcpu);
833
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
662834 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
663835
664836 cr0 |= X86_CR0_ET;
....@@ -676,27 +848,27 @@
676848 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
677849 return 1;
678850
679
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
680851 #ifdef CONFIG_X86_64
681
- if ((vcpu->arch.efer & EFER_LME)) {
682
- int cs_db, cs_l;
852
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
853
+ (cr0 & X86_CR0_PG)) {
854
+ int cs_db, cs_l;
683855
684
- if (!is_pae(vcpu))
685
- return 1;
686
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
687
- if (cs_l)
688
- return 1;
689
- } else
690
-#endif
691
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
692
- kvm_read_cr3(vcpu)))
856
+ if (!is_pae(vcpu))
857
+ return 1;
858
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
859
+ if (cs_l)
693860 return 1;
694861 }
862
+#endif
863
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
864
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
865
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
866
+ return 1;
695867
696868 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
697869 return 1;
698870
699
- kvm_x86_ops->set_cr0(vcpu, cr0);
871
+ kvm_x86_ops.set_cr0(vcpu, cr0);
700872
701873 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
702874 kvm_clear_async_pf_completion_queue(vcpu);
....@@ -721,27 +893,48 @@
721893 }
722894 EXPORT_SYMBOL_GPL(kvm_lmsw);
723895
724
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
896
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
725897 {
726
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
727
- !vcpu->guest_xcr0_loaded) {
728
- /* kvm_set_xcr() also depends on this */
898
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
899
+
729900 if (vcpu->arch.xcr0 != host_xcr0)
730901 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
731
- vcpu->guest_xcr0_loaded = 1;
732
- }
733
-}
734
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
735902
736
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
903
+ if (vcpu->arch.xsaves_enabled &&
904
+ vcpu->arch.ia32_xss != host_xss)
905
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
906
+ }
907
+
908
+ if (static_cpu_has(X86_FEATURE_PKU) &&
909
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
910
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
911
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
912
+ __write_pkru(vcpu->arch.pkru);
913
+}
914
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
915
+
916
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
737917 {
738
- if (vcpu->guest_xcr0_loaded) {
918
+ if (static_cpu_has(X86_FEATURE_PKU) &&
919
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
920
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
921
+ vcpu->arch.pkru = rdpkru();
922
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
923
+ __write_pkru(vcpu->arch.host_pkru);
924
+ }
925
+
926
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
927
+
739928 if (vcpu->arch.xcr0 != host_xcr0)
740929 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
741
- vcpu->guest_xcr0_loaded = 0;
930
+
931
+ if (vcpu->arch.xsaves_enabled &&
932
+ vcpu->arch.ia32_xss != host_xss)
933
+ wrmsrl(MSR_IA32_XSS, host_xss);
742934 }
935
+
743936 }
744
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
937
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
745938
746939 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
747940 {
....@@ -779,13 +972,13 @@
779972 vcpu->arch.xcr0 = xcr0;
780973
781974 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
782
- kvm_update_cpuid(vcpu);
975
+ kvm_update_cpuid_runtime(vcpu);
783976 return 0;
784977 }
785978
786979 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
787980 {
788
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
981
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
789982 __kvm_set_xcr(vcpu, index, xcr)) {
790983 kvm_inject_gp(vcpu, 0);
791984 return 1;
....@@ -794,63 +987,20 @@
794987 }
795988 EXPORT_SYMBOL_GPL(kvm_set_xcr);
796989
797
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
798
-{
799
- u64 reserved_bits = CR4_RESERVED_BITS;
800
-
801
- if (!cpu_has(c, X86_FEATURE_XSAVE))
802
- reserved_bits |= X86_CR4_OSXSAVE;
803
-
804
- if (!cpu_has(c, X86_FEATURE_SMEP))
805
- reserved_bits |= X86_CR4_SMEP;
806
-
807
- if (!cpu_has(c, X86_FEATURE_SMAP))
808
- reserved_bits |= X86_CR4_SMAP;
809
-
810
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
811
- reserved_bits |= X86_CR4_FSGSBASE;
812
-
813
- if (!cpu_has(c, X86_FEATURE_PKU))
814
- reserved_bits |= X86_CR4_PKE;
815
-
816
- if (!cpu_has(c, X86_FEATURE_LA57) &&
817
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
818
- reserved_bits |= X86_CR4_LA57;
819
-
820
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
821
- reserved_bits |= X86_CR4_UMIP;
822
-
823
- return reserved_bits;
824
-}
825
-
826
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
990
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
827991 {
828992 if (cr4 & cr4_reserved_bits)
829993 return -EINVAL;
830994
831
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
995
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
832996 return -EINVAL;
833997
834
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
835
- return -EINVAL;
836
-
837
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
838
- return -EINVAL;
839
-
840
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
841
- return -EINVAL;
842
-
843
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
844
- return -EINVAL;
845
-
846
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
847
- return -EINVAL;
848
-
849
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
998
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
850999 return -EINVAL;
8511000
8521001 return 0;
8531002 }
1003
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
8541004
8551005 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
8561006 {
....@@ -882,15 +1032,14 @@
8821032 return 1;
8831033 }
8841034
885
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
886
- return 1;
1035
+ kvm_x86_ops.set_cr4(vcpu, cr4);
8871036
8881037 if (((cr4 ^ old_cr4) & mmu_role_bits) ||
8891038 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
8901039 kvm_mmu_reset_context(vcpu);
8911040
8921041 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
893
- kvm_update_cpuid(vcpu);
1042
+ kvm_update_cpuid_runtime(vcpu);
8941043
8951044 return 0;
8961045 }
....@@ -911,21 +1060,21 @@
9111060 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
9121061 if (!skip_tlb_flush) {
9131062 kvm_mmu_sync_roots(vcpu);
914
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1063
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9151064 }
9161065 return 0;
9171066 }
9181067
9191068 if (is_long_mode(vcpu) &&
920
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
1069
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
9211070 return 1;
9221071 else if (is_pae_paging(vcpu) &&
9231072 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
9241073 return 1;
9251074
926
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
1075
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
9271076 vcpu->arch.cr3 = cr3;
928
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1077
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9291078
9301079 return 0;
9311080 }
....@@ -963,13 +1112,7 @@
9631112 }
9641113 }
9651114
966
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
967
-{
968
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
969
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
970
-}
971
-
972
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
1115
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
9731116 {
9741117 unsigned long dr7;
9751118
....@@ -977,11 +1120,12 @@
9771120 dr7 = vcpu->arch.guest_debug_dr7;
9781121 else
9791122 dr7 = vcpu->arch.dr7;
980
- kvm_x86_ops->set_dr7(vcpu, dr7);
1123
+ kvm_x86_ops.set_dr7(vcpu, dr7);
9811124 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
9821125 if (dr7 & DR7_BP_EN_MASK)
9831126 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
9841127 }
1128
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
9851129
9861130 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
9871131 {
....@@ -1003,17 +1147,14 @@
10031147 vcpu->arch.eff_db[dr] = val;
10041148 break;
10051149 case 4:
1006
- /* fall through */
10071150 case 6:
1008
- if (val & 0xffffffff00000000ULL)
1151
+ if (!kvm_dr6_valid(val))
10091152 return -1; /* #GP */
10101153 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1011
- kvm_update_dr6(vcpu);
10121154 break;
10131155 case 5:
1014
- /* fall through */
10151156 default: /* 7 */
1016
- if (val & 0xffffffff00000000ULL)
1157
+ if (!kvm_dr7_valid(val))
10171158 return -1; /* #GP */
10181159 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
10191160 kvm_update_dr7(vcpu);
....@@ -1042,15 +1183,10 @@
10421183 *val = vcpu->arch.db[array_index_nospec(dr, size)];
10431184 break;
10441185 case 4:
1045
- /* fall through */
10461186 case 6:
1047
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1048
- *val = vcpu->arch.dr6;
1049
- else
1050
- *val = kvm_x86_ops->get_dr6(vcpu);
1187
+ *val = vcpu->arch.dr6;
10511188 break;
10521189 case 5:
1053
- /* fall through */
10541190 default: /* 7 */
10551191 *val = vcpu->arch.dr7;
10561192 break;
....@@ -1061,15 +1197,15 @@
10611197
10621198 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
10631199 {
1064
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1200
+ u32 ecx = kvm_rcx_read(vcpu);
10651201 u64 data;
10661202 int err;
10671203
10681204 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
10691205 if (err)
10701206 return err;
1071
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1072
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1207
+ kvm_rax_write(vcpu, (u32)data);
1208
+ kvm_rdx_write(vcpu, data >> 32);
10731209 return err;
10741210 }
10751211 EXPORT_SYMBOL_GPL(kvm_rdpmc);
....@@ -1078,26 +1214,66 @@
10781214 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
10791215 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
10801216 *
1081
- * This list is modified at module load time to reflect the
1217
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
1218
+ * extract the supported MSRs from the related const lists.
1219
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
10821220 * capabilities of the host cpu. This capabilities test skips MSRs that are
1083
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1221
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
10841222 * may depend on host virtualization features rather than host cpu features.
10851223 */
10861224
1087
-static u32 msrs_to_save[] = {
1225
+static const u32 msrs_to_save_all[] = {
10881226 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
10891227 MSR_STAR,
10901228 #ifdef CONFIG_X86_64
10911229 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
10921230 #endif
10931231 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1094
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1095
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1232
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1233
+ MSR_IA32_SPEC_CTRL,
1234
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1235
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1236
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1237
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1238
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1239
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1240
+ MSR_IA32_UMWAIT_CONTROL,
1241
+
1242
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1243
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1244
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1245
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1246
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1247
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1248
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1249
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1250
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1251
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1252
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1253
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1254
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1255
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1256
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1257
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1258
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1259
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1260
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1261
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1262
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1263
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1264
+
1265
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1266
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1267
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1268
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1269
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1270
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
10961271 };
10971272
1273
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
10981274 static unsigned num_msrs_to_save;
10991275
1100
-static u32 emulated_msrs[] = {
1276
+static const u32 emulated_msrs_all[] = {
11011277 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
11021278 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
11031279 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
....@@ -1113,12 +1289,18 @@
11131289 HV_X64_MSR_VP_ASSIST_PAGE,
11141290 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
11151291 HV_X64_MSR_TSC_EMULATION_STATUS,
1292
+ HV_X64_MSR_SYNDBG_OPTIONS,
1293
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1294
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1295
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
11161296
11171297 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1118
- MSR_KVM_PV_EOI_EN,
1298
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
11191299
11201300 MSR_IA32_TSC_ADJUST,
11211301 MSR_IA32_TSCDEADLINE,
1302
+ MSR_IA32_ARCH_CAPABILITIES,
1303
+ MSR_IA32_PERF_CAPABILITIES,
11221304 MSR_IA32_MISC_ENABLE,
11231305 MSR_IA32_MCG_STATUS,
11241306 MSR_IA32_MCG_CTL,
....@@ -1128,15 +1310,41 @@
11281310 MSR_PLATFORM_INFO,
11291311 MSR_MISC_FEATURES_ENABLES,
11301312 MSR_AMD64_VIRT_SPEC_CTRL,
1313
+ MSR_IA32_POWER_CTL,
1314
+ MSR_IA32_UCODE_REV,
1315
+
1316
+ /*
1317
+ * The following list leaves out MSRs whose values are determined
1318
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1319
+ * We always support the "true" VMX control MSRs, even if the host
1320
+ * processor does not, so I am putting these registers here rather
1321
+ * than in msrs_to_save_all.
1322
+ */
1323
+ MSR_IA32_VMX_BASIC,
1324
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1325
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1326
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
1327
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1328
+ MSR_IA32_VMX_MISC,
1329
+ MSR_IA32_VMX_CR0_FIXED0,
1330
+ MSR_IA32_VMX_CR4_FIXED0,
1331
+ MSR_IA32_VMX_VMCS_ENUM,
1332
+ MSR_IA32_VMX_PROCBASED_CTLS2,
1333
+ MSR_IA32_VMX_EPT_VPID_CAP,
1334
+ MSR_IA32_VMX_VMFUNC,
1335
+
1336
+ MSR_K7_HWCR,
1337
+ MSR_KVM_POLL_CONTROL,
11311338 };
11321339
1340
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
11331341 static unsigned num_emulated_msrs;
11341342
11351343 /*
11361344 * List of msr numbers which are used to expose MSR-based features that
11371345 * can be used by a hypervisor to validate requested CPU features.
11381346 */
1139
-static u32 msr_based_features[] = {
1347
+static const u32 msr_based_features_all[] = {
11401348 MSR_IA32_VMX_BASIC,
11411349 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
11421350 MSR_IA32_VMX_PINBASED_CTLS,
....@@ -1156,18 +1364,41 @@
11561364 MSR_IA32_VMX_EPT_VPID_CAP,
11571365 MSR_IA32_VMX_VMFUNC,
11581366
1159
- MSR_F10H_DECFG,
1367
+ MSR_AMD64_DE_CFG,
11601368 MSR_IA32_UCODE_REV,
11611369 MSR_IA32_ARCH_CAPABILITIES,
1370
+ MSR_IA32_PERF_CAPABILITIES,
11621371 };
11631372
1373
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
11641374 static unsigned int num_msr_based_features;
11651375
1166
-u64 kvm_get_arch_capabilities(void)
1167
-{
1168
- u64 data;
1376
+/*
1377
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1378
+ * does not yet virtualize. These include:
1379
+ * 10 - MISC_PACKAGE_CTRLS
1380
+ * 11 - ENERGY_FILTERING_CTL
1381
+ * 12 - DOITM
1382
+ * 18 - FB_CLEAR_CTRL
1383
+ * 21 - XAPIC_DISABLE_STATUS
1384
+ * 23 - OVERCLOCKING_STATUS
1385
+ */
11691386
1170
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1387
+#define KVM_SUPPORTED_ARCH_CAP \
1388
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1389
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1390
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1391
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1392
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
1393
+
1394
+static u64 kvm_get_arch_capabilities(void)
1395
+{
1396
+ u64 data = 0;
1397
+
1398
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1399
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1400
+ data &= KVM_SUPPORTED_ARCH_CAP;
1401
+ }
11711402
11721403 /*
11731404 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
....@@ -1196,34 +1427,30 @@
11961427 if (!boot_cpu_has_bug(X86_BUG_MDS))
11971428 data |= ARCH_CAP_MDS_NO;
11981429
1199
- /*
1200
- * On TAA affected systems, export MDS_NO=0 when:
1201
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
1202
- * - Updated microcode is present. This is detected by
1203
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
1204
- * that VERW clears CPU buffers.
1205
- *
1206
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
1207
- * mitigation and don't complain:
1208
- *
1209
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
1210
- *
1211
- * If TSX is disabled on the system, guests are also mitigated against
1212
- * TAA and clear CPU buffer mitigation is not required for guests.
1213
- */
1214
- if (!boot_cpu_has(X86_FEATURE_RTM))
1430
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
1431
+ /*
1432
+ * If RTM=0 because the kernel has disabled TSX, the host might
1433
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
1434
+ * and therefore knows that there cannot be TAA) but keep
1435
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1436
+ * and we want to allow migrating those guests to tsx=off hosts.
1437
+ */
12151438 data &= ~ARCH_CAP_TAA_NO;
1216
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
1439
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
12171440 data |= ARCH_CAP_TAA_NO;
1218
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
1219
- data &= ~ARCH_CAP_MDS_NO;
1441
+ } else {
1442
+ /*
1443
+ * Nothing to do here; we emulate TSX_CTRL if present on the
1444
+ * host so the guest can choose between disabling TSX or
1445
+ * using VERW to clear CPU buffers.
1446
+ */
1447
+ }
12201448
1221
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
1222
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
1449
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
1450
+ data |= ARCH_CAP_GDS_NO;
1451
+
12231452 return data;
12241453 }
1225
-
1226
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
12271454
12281455 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
12291456 {
....@@ -1235,8 +1462,7 @@
12351462 rdmsrl_safe(msr->index, &msr->data);
12361463 break;
12371464 default:
1238
- if (kvm_x86_ops->get_msr_feature(msr))
1239
- return 1;
1465
+ return kvm_x86_ops.get_msr_feature(msr);
12401466 }
12411467 return 0;
12421468 }
....@@ -1248,6 +1474,14 @@
12481474
12491475 msr.index = index;
12501476 r = kvm_get_msr_feature(&msr);
1477
+
1478
+ if (r == KVM_MSR_RET_INVALID) {
1479
+ /* Unconditionally clear the output for simplicity */
1480
+ *data = 0;
1481
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1482
+ r = 0;
1483
+ }
1484
+
12511485 if (r)
12521486 return r;
12531487
....@@ -1262,6 +1496,13 @@
12621496 return false;
12631497
12641498 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1499
+ return false;
1500
+
1501
+ if (efer & (EFER_LME | EFER_LMA) &&
1502
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1503
+ return false;
1504
+
1505
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
12651506 return false;
12661507
12671508 return true;
....@@ -1280,6 +1521,7 @@
12801521 {
12811522 u64 old_efer = vcpu->arch.efer;
12821523 u64 efer = msr_info->data;
1524
+ int r;
12831525
12841526 if (efer & efer_reserved_bits)
12851527 return 1;
....@@ -1296,7 +1538,11 @@
12961538 efer &= ~EFER_LMA;
12971539 efer |= vcpu->arch.efer & EFER_LMA;
12981540
1299
- kvm_x86_ops->set_efer(vcpu, efer);
1541
+ r = kvm_x86_ops.set_efer(vcpu, efer);
1542
+ if (r) {
1543
+ WARN_ON(r > 0);
1544
+ return r;
1545
+ }
13001546
13011547 /* Update reserved bits */
13021548 if ((efer ^ old_efer) & EFER_NX)
....@@ -1311,20 +1557,73 @@
13111557 }
13121558 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
13131559
1560
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1561
+{
1562
+ struct kvm_x86_msr_filter *msr_filter;
1563
+ struct msr_bitmap_range *ranges;
1564
+ struct kvm *kvm = vcpu->kvm;
1565
+ bool allowed;
1566
+ int idx;
1567
+ u32 i;
1568
+
1569
+ /* x2APIC MSRs do not support filtering. */
1570
+ if (index >= 0x800 && index <= 0x8ff)
1571
+ return true;
1572
+
1573
+ idx = srcu_read_lock(&kvm->srcu);
1574
+
1575
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1576
+ if (!msr_filter) {
1577
+ allowed = true;
1578
+ goto out;
1579
+ }
1580
+
1581
+ allowed = msr_filter->default_allow;
1582
+ ranges = msr_filter->ranges;
1583
+
1584
+ for (i = 0; i < msr_filter->count; i++) {
1585
+ u32 start = ranges[i].base;
1586
+ u32 end = start + ranges[i].nmsrs;
1587
+ u32 flags = ranges[i].flags;
1588
+ unsigned long *bitmap = ranges[i].bitmap;
1589
+
1590
+ if ((index >= start) && (index < end) && (flags & type)) {
1591
+ allowed = !!test_bit(index - start, bitmap);
1592
+ break;
1593
+ }
1594
+
1595
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
1596
+ ++vcpu->stat.exits;
1597
+ }
1598
+
1599
+out:
1600
+ srcu_read_unlock(&kvm->srcu, idx);
1601
+
1602
+ return allowed;
1603
+}
1604
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1605
+
13141606 /*
1315
- * Writes msr value into into the appropriate "register".
1607
+ * Write @data into the MSR specified by @index. Select MSR specific fault
1608
+ * checks are bypassed if @host_initiated is %true.
13161609 * Returns 0 on success, non-0 otherwise.
13171610 * Assumes vcpu_load() was already called.
13181611 */
1319
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1612
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1613
+ bool host_initiated)
13201614 {
1321
- switch (msr->index) {
1615
+ struct msr_data msr;
1616
+
1617
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1618
+ return KVM_MSR_RET_FILTERED;
1619
+
1620
+ switch (index) {
13221621 case MSR_FS_BASE:
13231622 case MSR_GS_BASE:
13241623 case MSR_KERNEL_GS_BASE:
13251624 case MSR_CSTAR:
13261625 case MSR_LSTAR:
1327
- if (is_noncanonical_address(msr->data, vcpu))
1626
+ if (is_noncanonical_address(data, vcpu))
13281627 return 1;
13291628 break;
13301629 case MSR_IA32_SYSENTER_EIP:
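Side note, not part of the commit: kvm_msr_allowed(), added earlier in this hunk, resolves an MSR index against the installed filter by scanning ranges and testing one bit per MSR. A compact standalone sketch of that lookup follows; the types are hypothetical stand-ins for kvm_x86_msr_filter and msr_bitmap_range, not the kernel structures themselves.

/*
 * Illustrative sketch only -- not from the commit. An MSR index is allowed
 * either by the bit covering it in the first matching range, or by the
 * filter's default policy when no range matches.
 */
#include <stdbool.h>
#include <stdint.h>

struct msr_range {			/* stands in for struct msr_bitmap_range */
	uint32_t base;			/* first MSR index covered */
	uint32_t nmsrs;			/* number of MSRs covered */
	uint32_t flags;			/* read/write bits this range filters */
	const uint8_t *bitmap;		/* one bit per MSR, 1 = allowed */
};

struct msr_filter {			/* stands in for struct kvm_x86_msr_filter */
	bool default_allow;
	uint32_t count;
	struct msr_range ranges[16];
};

static bool msr_allowed(const struct msr_filter *filter, uint32_t index,
			uint32_t type)
{
	uint32_t i;

	if (!filter)			/* no filter installed: allow everything */
		return true;

	for (i = 0; i < filter->count; i++) {
		const struct msr_range *r = &filter->ranges[i];

		if (index >= r->base && index < r->base + r->nmsrs &&
		    (r->flags & type)) {
			uint32_t bit = index - r->base;

			return (r->bitmap[bit / 8] >> (bit % 8)) & 1;
		}
	}

	return filter->default_allow;	/* no range matched */
}

Falling back to default_allow when no range matches mirrors the msr_filter->default_allow path in the real function.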
....@@ -1341,54 +1640,313 @@
13411640 * value, and that something deterministic happens if the guest
13421641 * invokes 64-bit SYSENTER.
13431642 */
1344
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1643
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
13451644 }
1346
- return kvm_x86_ops->set_msr(vcpu, msr);
1645
+
1646
+ msr.data = data;
1647
+ msr.index = index;
1648
+ msr.host_initiated = host_initiated;
1649
+
1650
+ return kvm_x86_ops.set_msr(vcpu, &msr);
1651
+}
1652
+
1653
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1654
+ u32 index, u64 data, bool host_initiated)
1655
+{
1656
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1657
+
1658
+ if (ret == KVM_MSR_RET_INVALID)
1659
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
1660
+ ret = 0;
1661
+
1662
+ return ret;
1663
+}
1664
+
1665
+/*
1666
+ * Read the MSR specified by @index into @data. Select MSR specific fault
1667
+ * checks are bypassed if @host_initiated is %true.
1668
+ * Returns 0 on success, non-0 otherwise.
1669
+ * Assumes vcpu_load() was already called.
1670
+ */
1671
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1672
+ bool host_initiated)
1673
+{
1674
+ struct msr_data msr;
1675
+ int ret;
1676
+
1677
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1678
+ return KVM_MSR_RET_FILTERED;
1679
+
1680
+ msr.index = index;
1681
+ msr.host_initiated = host_initiated;
1682
+
1683
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
1684
+ if (!ret)
1685
+ *data = msr.data;
1686
+ return ret;
1687
+}
1688
+
1689
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1690
+ u32 index, u64 *data, bool host_initiated)
1691
+{
1692
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1693
+
1694
+ if (ret == KVM_MSR_RET_INVALID) {
1695
+ /* Unconditionally clear *data for simplicity */
1696
+ *data = 0;
1697
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1698
+ ret = 0;
1699
+ }
1700
+
1701
+ return ret;
1702
+}
1703
+
1704
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1705
+{
1706
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
1707
+}
1708
+EXPORT_SYMBOL_GPL(kvm_get_msr);
1709
+
1710
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1711
+{
1712
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
13471713 }
13481714 EXPORT_SYMBOL_GPL(kvm_set_msr);
1715
+
1716
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1717
+{
1718
+ if (vcpu->run->msr.error) {
1719
+ kvm_inject_gp(vcpu, 0);
1720
+ return 1;
1721
+ } else if (is_read) {
1722
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1723
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1724
+ }
1725
+
1726
+ return kvm_skip_emulated_instruction(vcpu);
1727
+}
1728
+
1729
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1730
+{
1731
+ return complete_emulated_msr(vcpu, true);
1732
+}
1733
+
1734
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1735
+{
1736
+ return complete_emulated_msr(vcpu, false);
1737
+}
1738
+
1739
+static u64 kvm_msr_reason(int r)
1740
+{
1741
+ switch (r) {
1742
+ case KVM_MSR_RET_INVALID:
1743
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
1744
+ case KVM_MSR_RET_FILTERED:
1745
+ return KVM_MSR_EXIT_REASON_FILTER;
1746
+ default:
1747
+ return KVM_MSR_EXIT_REASON_INVAL;
1748
+ }
1749
+}
1750
+
1751
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1752
+ u32 exit_reason, u64 data,
1753
+ int (*completion)(struct kvm_vcpu *vcpu),
1754
+ int r)
1755
+{
1756
+ u64 msr_reason = kvm_msr_reason(r);
1757
+
1758
+ /* Check if the user wanted to know about this MSR fault */
1759
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1760
+ return 0;
1761
+
1762
+ vcpu->run->exit_reason = exit_reason;
1763
+ vcpu->run->msr.error = 0;
1764
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1765
+ vcpu->run->msr.reason = msr_reason;
1766
+ vcpu->run->msr.index = index;
1767
+ vcpu->run->msr.data = data;
1768
+ vcpu->arch.complete_userspace_io = completion;
1769
+
1770
+ return 1;
1771
+}
1772
+
1773
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1774
+{
1775
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1776
+ complete_emulated_rdmsr, r);
1777
+}
1778
+
1779
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1780
+{
1781
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1782
+ complete_emulated_wrmsr, r);
1783
+}
1784
+
1785
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1786
+{
1787
+ u32 ecx = kvm_rcx_read(vcpu);
1788
+ u64 data;
1789
+ int r;
1790
+
1791
+ r = kvm_get_msr(vcpu, ecx, &data);
1792
+
1793
+ /* MSR read failed? See if we should ask user space */
1794
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1795
+ /* Bounce to user space */
1796
+ return 0;
1797
+ }
1798
+
1799
+ /* MSR read failed? Inject a #GP */
1800
+ if (r) {
1801
+ trace_kvm_msr_read_ex(ecx);
1802
+ kvm_inject_gp(vcpu, 0);
1803
+ return 1;
1804
+ }
1805
+
1806
+ trace_kvm_msr_read(ecx, data);
1807
+
1808
+ kvm_rax_write(vcpu, data & -1u);
1809
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
1810
+ return kvm_skip_emulated_instruction(vcpu);
1811
+}
1812
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1813
+
1814
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1815
+{
1816
+ u32 ecx = kvm_rcx_read(vcpu);
1817
+ u64 data = kvm_read_edx_eax(vcpu);
1818
+ int r;
1819
+
1820
+ r = kvm_set_msr(vcpu, ecx, data);
1821
+
1822
+ /* MSR write failed? See if we should ask user space */
1823
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1824
+ /* Bounce to user space */
1825
+ return 0;
1826
+
1827
+ /* Signal all other negative errors to userspace */
1828
+ if (r < 0)
1829
+ return r;
1830
+
1831
+ /* MSR write failed? Inject a #GP */
1832
+ if (r > 0) {
1833
+ trace_kvm_msr_write_ex(ecx, data);
1834
+ kvm_inject_gp(vcpu, 0);
1835
+ return 1;
1836
+ }
1837
+
1838
+ trace_kvm_msr_write(ecx, data);
1839
+ return kvm_skip_emulated_instruction(vcpu);
1840
+}
1841
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1842
+
1843
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1844
+{
1845
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1846
+ xfer_to_guest_mode_work_pending();
1847
+}
1848
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1849
+
1850
+/*
1851
+ * The fast path for frequent and performance sensitive wrmsr emulation,
1852
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
1853
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
1854
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
1855
+ * other cases which must be called after interrupts are enabled on the host.
1856
+ */
1857
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1858
+{
1859
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1860
+ return 1;
1861
+
1862
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1863
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1864
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1865
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1866
+
1867
+ data &= ~(1 << 12);
1868
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1869
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1870
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1871
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
1872
+ return 0;
1873
+ }
1874
+
1875
+ return 1;
1876
+}
1877
+
1878
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1879
+{
1880
+ if (!kvm_can_use_hv_timer(vcpu))
1881
+ return 1;
1882
+
1883
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
1884
+ return 0;
1885
+}
1886
+
1887
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1888
+{
1889
+ u32 msr = kvm_rcx_read(vcpu);
1890
+ u64 data;
1891
+ fastpath_t ret = EXIT_FASTPATH_NONE;
1892
+
1893
+ switch (msr) {
1894
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
1895
+ data = kvm_read_edx_eax(vcpu);
1896
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1897
+ kvm_skip_emulated_instruction(vcpu);
1898
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
1899
+ }
1900
+ break;
1901
+ case MSR_IA32_TSCDEADLINE:
1902
+ data = kvm_read_edx_eax(vcpu);
1903
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1904
+ kvm_skip_emulated_instruction(vcpu);
1905
+ ret = EXIT_FASTPATH_REENTER_GUEST;
1906
+ }
1907
+ break;
1908
+ default:
1909
+ break;
1910
+ }
1911
+
1912
+ if (ret != EXIT_FASTPATH_NONE)
1913
+ trace_kvm_msr_write(msr, data);
1914
+
1915
+ return ret;
1916
+}
1917
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
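/*
 * A stand-alone sketch of the gate used by the x2APIC ICR fastpath above:
 * only fixed, physical-destination, no-shorthand, non-broadcast IPIs are
 * handled with interrupts still disabled, using the 64-bit EDX:EAX WRMSR
 * payload.  The mask values mirror the APIC_* definitions referenced in
 * the function and should be treated as illustrative, not authoritative.
 */
#include <stdint.h>
#include <stdbool.h>

#define ICR_SHORT_MASK		0xc0000ULL	/* destination shorthand          */
#define ICR_DEST_MODE_MASK	0x00800ULL	/* 0 = physical destination       */
#define ICR_DELIVERY_MASK	0x00700ULL	/* 0 = fixed delivery mode        */
#define ICR_BUSY_BIT		(1ULL << 12)	/* delivery status, cleared above */
#define X2APIC_BCAST_ID		0xffffffffu

static bool icr_is_fastpath_ipi(uint32_t eax, uint32_t edx)
{
	uint64_t icr = ((uint64_t)edx << 32) | eax;	/* WRMSR payload */

	return (icr & ICR_SHORT_MASK) == 0 &&
	       (icr & ICR_DEST_MODE_MASK) == 0 &&
	       (icr & ICR_DELIVERY_MASK) == 0 &&
	       (uint32_t)(icr >> 32) != X2APIC_BCAST_ID;
}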
13491918
13501919 /*
13511920 * Adapt set_msr() to msr_io()'s calling convention
13521921 */
13531922 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13541923 {
1355
- struct msr_data msr;
1356
- int r;
1357
-
1358
- msr.index = index;
1359
- msr.host_initiated = true;
1360
- r = kvm_get_msr(vcpu, &msr);
1361
- if (r)
1362
- return r;
1363
-
1364
- *data = msr.data;
1365
- return 0;
1924
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
13661925 }
13671926
13681927 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13691928 {
1370
- struct msr_data msr;
1371
-
1372
- msr.data = *data;
1373
- msr.index = index;
1374
- msr.host_initiated = true;
1375
- return kvm_set_msr(vcpu, &msr);
1929
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
13761930 }
13771931
13781932 #ifdef CONFIG_X86_64
1933
+struct pvclock_clock {
1934
+ int vclock_mode;
1935
+ u64 cycle_last;
1936
+ u64 mask;
1937
+ u32 mult;
1938
+ u32 shift;
1939
+ u64 base_cycles;
1940
+ u64 offset;
1941
+};
1942
+
13791943 struct pvclock_gtod_data {
13801944 seqcount_t seq;
13811945
1382
- struct { /* extract of a clocksource struct */
1383
- int vclock_mode;
1384
- u64 cycle_last;
1385
- u64 mask;
1386
- u32 mult;
1387
- u32 shift;
1388
- } clock;
1946
+ struct pvclock_clock clock; /* extract of a clocksource struct */
1947
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
13891948
1390
- u64 boot_ns;
1391
- u64 nsec_base;
1949
+ ktime_t offs_boot;
13921950 u64 wall_time_sec;
13931951 };
13941952
....@@ -1397,44 +1955,54 @@
13971955 static void update_pvclock_gtod(struct timekeeper *tk)
13981956 {
13991957 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1400
- u64 boot_ns;
1401
-
1402
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
14031958
14041959 write_seqcount_begin(&vdata->seq);
14051960
14061961 /* copy pvclock gtod data */
1407
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1962
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
14081963 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
14091964 vdata->clock.mask = tk->tkr_mono.mask;
14101965 vdata->clock.mult = tk->tkr_mono.mult;
14111966 vdata->clock.shift = tk->tkr_mono.shift;
1967
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
1968
+ vdata->clock.offset = tk->tkr_mono.base;
14121969
1413
- vdata->boot_ns = boot_ns;
1414
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1970
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
1971
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
1972
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
1973
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
1974
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
1975
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
1976
+ vdata->raw_clock.offset = tk->tkr_raw.base;
14151977
14161978 vdata->wall_time_sec = tk->xtime_sec;
14171979
1980
+ vdata->offs_boot = tk->offs_boot;
1981
+
14181982 write_seqcount_end(&vdata->seq);
14191983 }
1420
-#endif
14211984
1422
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1985
+static s64 get_kvmclock_base_ns(void)
14231986 {
1424
- /*
1425
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1426
- * vcpu_enter_guest. This function is only called from
1427
- * the physical CPU that is running vcpu.
1428
- */
1429
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1987
+ /* Count up from boot time, but with the frequency of the raw clock. */
1988
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
14301989 }
1990
+#else
1991
+static s64 get_kvmclock_base_ns(void)
1992
+{
1993
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
1994
+ return ktime_get_boottime_ns();
1995
+}
1996
+#endif
14311997
14321998 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
14331999 {
14342000 int version;
14352001 int r;
14362002 struct pvclock_wall_clock wc;
1437
- struct timespec64 boot;
2003
+ u64 wall_nsec;
2004
+
2005
+ kvm->arch.wall_clock = wall_clock;
14382006
14392007 if (!wall_clock)
14402008 return;
....@@ -1454,23 +2022,46 @@
14542022 /*
14552023 * The guest calculates current wall clock time by adding
14562024 * system time (updated by kvm_guest_time_update below) to the
1457
- * wall clock specified here. guest system time equals host
1458
- * system time for us, thus we must fill in host boot time here.
2025
+ * wall clock specified here. We do the reverse here.
14592026 */
1460
- getboottime64(&boot);
2027
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
14612028
1462
- if (kvm->arch.kvmclock_offset) {
1463
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1464
- boot = timespec64_sub(boot, ts);
1465
- }
1466
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1467
- wc.nsec = boot.tv_nsec;
2029
+ wc.nsec = do_div(wall_nsec, 1000000000);
2030
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
14682031 wc.version = version;
14692032
14702033 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
14712034
14722035 version++;
14732036 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2037
+}
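/*
 * Minimal arithmetic sketch of the "reverse" computation above: the guest
 * derives wall time as wall_clock + kvmclock, so KVM publishes current
 * realtime minus current kvmclock, split into seconds and nanoseconds
 * (plain C division standing in for the kernel's do_div()).
 */
#include <stdint.h>

struct wall_clock_pair {
	uint32_t sec;	/* wraps in 2106, as noted above */
	uint32_t nsec;
};

static struct wall_clock_pair split_wall_clock(uint64_t realtime_ns,
					       uint64_t kvmclock_ns)
{
	uint64_t wall_nsec = realtime_ns - kvmclock_ns;

	return (struct wall_clock_pair) {
		.sec  = (uint32_t)(wall_nsec / 1000000000ULL),
		.nsec = (uint32_t)(wall_nsec % 1000000000ULL),
	};
}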
2038
+
2039
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2040
+ bool old_msr, bool host_initiated)
2041
+{
2042
+ struct kvm_arch *ka = &vcpu->kvm->arch;
2043
+
2044
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
2045
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2046
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2047
+
2048
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
2049
+ }
2050
+
2051
+ vcpu->arch.time = system_time;
2052
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2053
+
2054
+ /* we verify if the enable bit is set... */
2055
+ vcpu->arch.pv_time_enabled = false;
2056
+ if (!(system_time & 1))
2057
+ return;
2058
+
2059
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2060
+ &vcpu->arch.pv_time, system_time & ~1ULL,
2061
+ sizeof(struct pvclock_vcpu_time_info)))
2062
+ vcpu->arch.pv_time_enabled = true;
2063
+
2064
+ return;
14742065 }
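/*
 * Guest-side sketch of the MSR value parsed by kvm_write_system_time()
 * above: bit 0 enables the per-vCPU pvclock page and the remaining bits
 * carry its guest physical address.  The MSR index is assumed from the
 * kvmclock uapi; guest-internal wrmsr() plumbing is not shown.
 */
#include <stdint.h>

#define MSR_KVM_SYSTEM_TIME_NEW	0x4b564d01

static uint64_t kvmclock_system_time_val(uint64_t time_info_gpa, int enable)
{
	/* KVM strips the enable bit with "system_time & ~1ULL" above. */
	return (time_info_gpa & ~1ULL) | (enable ? 1ULL : 0);
}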
14752066
14762067 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
....@@ -1505,9 +2096,6 @@
15052096
15062097 *pshift = shift;
15072098 *pmultiplier = div_frac(scaled64, tps32);
1508
-
1509
- pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1510
- __func__, base_hz, scaled_hz, shift, *pmultiplier);
15112099 }
15122100
15132101 #ifdef CONFIG_X86_64
....@@ -1604,7 +2192,7 @@
16042192
16052193 static inline int gtod_is_based_on_tsc(int mode)
16062194 {
1607
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
2195
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
16082196 }
16092197
16102198 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
....@@ -1633,12 +2221,6 @@
16332221 atomic_read(&vcpu->kvm->online_vcpus),
16342222 ka->use_master_clock, gtod->clock.vclock_mode);
16352223 #endif
1636
-}
1637
-
1638
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1639
-{
1640
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1641
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
16422224 }
16432225
16442226 /*
....@@ -1679,15 +2261,14 @@
16792261
16802262 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
16812263 {
1682
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1683
-
1684
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2264
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
16852265 }
16862266 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
16872267
16882268 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
16892269 {
1690
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
2270
+ vcpu->arch.l1_tsc_offset = offset;
2271
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
16912272 }
16922273
16932274 static inline bool kvm_check_tsc_unstable(void)
....@@ -1697,29 +2278,28 @@
16972278 * TSC is marked unstable when we're running on Hyper-V,
16982279 * 'TSC page' clocksource is good.
16992280 */
1700
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
2281
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
17012282 return false;
17022283 #endif
17032284 return check_tsc_unstable();
17042285 }
17052286
1706
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
2287
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
17072288 {
17082289 struct kvm *kvm = vcpu->kvm;
17092290 u64 offset, ns, elapsed;
17102291 unsigned long flags;
17112292 bool matched;
17122293 bool already_matched;
1713
- u64 data = msr->data;
17142294 bool synchronizing = false;
17152295
17162296 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
17172297 offset = kvm_compute_tsc_offset(vcpu, data);
1718
- ns = ktime_get_boot_ns();
2298
+ ns = get_kvmclock_base_ns();
17192299 elapsed = ns - kvm->arch.last_tsc_nsec;
17202300
17212301 if (vcpu->arch.virtual_tsc_khz) {
1722
- if (data == 0 && msr->host_initiated) {
2302
+ if (data == 0) {
17232303 /*
17242304 * detection of vcpu initialization -- need to sync
17252305 * with other vCPUs. This particularly helps to keep
....@@ -1750,12 +2330,10 @@
17502330 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
17512331 if (!kvm_check_tsc_unstable()) {
17522332 offset = kvm->arch.cur_tsc_offset;
1753
- pr_debug("kvm: matched tsc offset for %llu\n", data);
17542333 } else {
17552334 u64 delta = nsec_to_cycles(vcpu, elapsed);
17562335 data += delta;
17572336 offset = kvm_compute_tsc_offset(vcpu, data);
1758
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
17592337 }
17602338 matched = true;
17612339 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
....@@ -1774,8 +2352,6 @@
17742352 kvm->arch.cur_tsc_write = data;
17752353 kvm->arch.cur_tsc_offset = offset;
17762354 matched = false;
1777
- pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1778
- kvm->arch.cur_tsc_generation, data);
17792355 }
17802356
17812357 /*
....@@ -1793,9 +2369,6 @@
17932369 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
17942370 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
17952371
1796
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1797
- update_ia32_tsc_adjust_msr(vcpu, offset);
1798
-
17992372 kvm_vcpu_write_tsc_offset(vcpu, offset);
18002373 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
18012374
....@@ -1810,12 +2383,10 @@
18102383 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
18112384 }
18122385
1813
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
1814
-
18152386 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
18162387 s64 adjustment)
18172388 {
1818
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
2389
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
18192390 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
18202391 }
18212392
....@@ -1849,43 +2420,43 @@
18492420 return last;
18502421 }
18512422
1852
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
2423
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2424
+ int *mode)
18532425 {
18542426 long v;
1855
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18562427 u64 tsc_pg_val;
18572428
1858
- switch (gtod->clock.vclock_mode) {
1859
- case VCLOCK_HVCLOCK:
2429
+ switch (clock->vclock_mode) {
2430
+ case VDSO_CLOCKMODE_HVCLOCK:
18602431 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
18612432 tsc_timestamp);
18622433 if (tsc_pg_val != U64_MAX) {
18632434 /* TSC page valid */
1864
- *mode = VCLOCK_HVCLOCK;
1865
- v = (tsc_pg_val - gtod->clock.cycle_last) &
1866
- gtod->clock.mask;
2435
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
2436
+ v = (tsc_pg_val - clock->cycle_last) &
2437
+ clock->mask;
18672438 } else {
18682439 /* TSC page invalid */
1869
- *mode = VCLOCK_NONE;
2440
+ *mode = VDSO_CLOCKMODE_NONE;
18702441 }
18712442 break;
1872
- case VCLOCK_TSC:
1873
- *mode = VCLOCK_TSC;
2443
+ case VDSO_CLOCKMODE_TSC:
2444
+ *mode = VDSO_CLOCKMODE_TSC;
18742445 *tsc_timestamp = read_tsc();
1875
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
1876
- gtod->clock.mask;
2446
+ v = (*tsc_timestamp - clock->cycle_last) &
2447
+ clock->mask;
18772448 break;
18782449 default:
1879
- *mode = VCLOCK_NONE;
2450
+ *mode = VDSO_CLOCKMODE_NONE;
18802451 }
18812452
1882
- if (*mode == VCLOCK_NONE)
2453
+ if (*mode == VDSO_CLOCKMODE_NONE)
18832454 *tsc_timestamp = v = 0;
18842455
1885
- return v * gtod->clock.mult;
2456
+ return v * clock->mult;
18862457 }
18872458
1888
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
2459
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
18892460 {
18902461 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18912462 unsigned long seq;
....@@ -1894,10 +2465,10 @@
18942465
18952466 do {
18962467 seq = read_seqcount_begin(&gtod->seq);
1897
- ns = gtod->nsec_base;
1898
- ns += vgettsc(tsc_timestamp, &mode);
1899
- ns >>= gtod->clock.shift;
1900
- ns += gtod->boot_ns;
2468
+ ns = gtod->raw_clock.base_cycles;
2469
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2470
+ ns >>= gtod->raw_clock.shift;
2471
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
19012472 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19022473 *t = ns;
19032474
....@@ -1914,8 +2485,8 @@
19142485 do {
19152486 seq = read_seqcount_begin(&gtod->seq);
19162487 ts->tv_sec = gtod->wall_time_sec;
1917
- ns = gtod->nsec_base;
1918
- ns += vgettsc(tsc_timestamp, &mode);
2488
+ ns = gtod->clock.base_cycles;
2489
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
19192490 ns >>= gtod->clock.shift;
19202491 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19212492
....@@ -1932,7 +2503,7 @@
19322503 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
19332504 return false;
19342505
1935
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
2506
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
19362507 tsc_timestamp));
19372508 }
19382509
....@@ -2057,7 +2628,7 @@
20572628 spin_lock(&ka->pvclock_gtod_sync_lock);
20582629 if (!ka->use_master_clock) {
20592630 spin_unlock(&ka->pvclock_gtod_sync_lock);
2060
- return ktime_get_boot_ns() + ka->kvmclock_offset;
2631
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
20612632 }
20622633
20632634 hv_clock.tsc_timestamp = ka->master_cycle_now;
....@@ -2073,7 +2644,7 @@
20732644 &hv_clock.tsc_to_system_mul);
20742645 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
20752646 } else
2076
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
2647
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
20772648
20782649 put_cpu();
20792650
....@@ -2172,7 +2743,7 @@
21722743 }
21732744 if (!use_master_clock) {
21742745 host_tsc = rdtsc();
2175
- kernel_ns = ktime_get_boot_ns();
2746
+ kernel_ns = get_kvmclock_base_ns();
21762747 }
21772748
21782749 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
....@@ -2284,6 +2855,18 @@
22842855 KVMCLOCK_SYNC_PERIOD);
22852856 }
22862857
2858
+/*
2859
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2860
+ */
2861
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2862
+{
2863
+ /* McStatusWrEn enabled? */
2864
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
2865
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2866
+
2867
+ return false;
2868
+}
2869
+
22872870 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
22882871 {
22892872 u64 mcg_cap = vcpu->arch.mcg_cap;
....@@ -2313,14 +2896,22 @@
23132896 /* only 0 or all 1s can be written to IA32_MCi_CTL
23142897 * some Linux kernels though clear bit 10 in bank 4 to
23152898 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
2316
- * this to avoid an uncatched #GP in the guest
2899
+ * this to avoid an uncaught #GP in the guest.
2900
+ *
2901
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
2902
+ * correctable, single-bit ECC data errors.
23172903 */
23182904 if ((offset & 0x3) == 0 &&
2319
- data != 0 && (data | (1 << 10)) != ~(u64)0)
2320
- return -1;
2905
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
2906
+ return 1;
2907
+
2908
+ /* MCi_STATUS */
23212909 if (!msr_info->host_initiated &&
2322
- (offset & 0x3) == 1 && data != 0)
2323
- return -1;
2910
+ (offset & 0x3) == 1 && data != 0) {
2911
+ if (!can_set_mci_status(vcpu))
2912
+ return 1;
2913
+ }
2914
+
23242915 vcpu->arch.mce_banks[offset] = data;
23252916 break;
23262917 }
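/*
 * Worked example of the IA32_MCi_CTL filter in the hunk above: only 0 or
 * all-ones is architecturally valid, but a cleared bit 10 (Linux K8 GART
 * quirk) or a cleared bit 0 (UnixWare ECC quirk) is tolerated by OR-ing
 * those bits back in before the all-ones comparison.
 */
#include <stdint.h>
#include <stdbool.h>

static bool mci_ctl_value_ok(uint64_t data)
{
	if (data == 0)
		return true;
	return (data | (1ULL << 10) | 1ULL) == ~(uint64_t)0;
}
/*
 * e.g. 0xfffffffffffffbfe (bits 0 and 10 clear) is accepted, while
 * 0x00000000000000ff fails the check and the write raises #GP in the guest.
 */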
....@@ -2340,104 +2931,192 @@
23402931 u32 page_num = data & ~PAGE_MASK;
23412932 u64 page_addr = data & PAGE_MASK;
23422933 u8 *page;
2343
- int r;
23442934
2345
- r = -E2BIG;
23462935 if (page_num >= blob_size)
2347
- goto out;
2348
- r = -ENOMEM;
2936
+ return 1;
2937
+
23492938 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2350
- if (IS_ERR(page)) {
2351
- r = PTR_ERR(page);
2352
- goto out;
2939
+ if (IS_ERR(page))
2940
+ return PTR_ERR(page);
2941
+
2942
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2943
+ kfree(page);
2944
+ return 1;
23532945 }
2354
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2355
- goto out_free;
2356
- r = 0;
2357
-out_free:
2358
- kfree(page);
2359
-out:
2360
- return r;
2946
+ return 0;
2947
+}
2948
+
2949
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2950
+{
2951
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2952
+
2953
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
23612954 }
23622955
23632956 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
23642957 {
23652958 gpa_t gpa = data & ~0x3f;
23662959
2367
- /* Bits 3:5 are reserved, Should be zero */
2368
- if (data & 0x38)
2960
+ /* Bits 4:5 are reserved and should be zero */
2961
+ if (data & 0x30)
23692962 return 1;
23702963
2371
- vcpu->arch.apf.msr_val = data;
2964
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2965
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2966
+ return 1;
23722967
2373
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
2968
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2969
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2970
+ return 1;
2971
+
2972
+ if (!lapic_in_kernel(vcpu))
2973
+ return data ? 1 : 0;
2974
+
2975
+ vcpu->arch.apf.msr_en_val = data;
2976
+
2977
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
23742978 kvm_clear_async_pf_completion_queue(vcpu);
23752979 kvm_async_pf_hash_reset(vcpu);
23762980 return 0;
23772981 }
23782982
23792983 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2380
- sizeof(u32)))
2984
+ sizeof(u64)))
23812985 return 1;
23822986
23832987 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
23842988 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2989
+
23852990 kvm_async_pf_wakeup_all(vcpu);
2991
+
2992
+ return 0;
2993
+}
2994
+
2995
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2996
+{
2997
+ /* Bits 8-63 are reserved */
2998
+ if (data >> 8)
2999
+ return 1;
3000
+
3001
+ if (!lapic_in_kernel(vcpu))
3002
+ return 1;
3003
+
3004
+ vcpu->arch.apf.msr_int_val = data;
3005
+
3006
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3007
+
23863008 return 0;
23873009 }
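/*
 * Guest-side sketch of the two async-PF MSR values validated above.
 * ASYNC_PF_EN carries the 64-byte-aligned GPA of the APF area plus the
 * low control bits (enable, send-always, delivery-as-interrupt);
 * ASYNC_PF_INT carries only the interrupt vector in bits 7:0.  The bit
 * positions mirror the uapi flags referenced in the code and are shown
 * here for illustration.
 */
#include <stdint.h>

#define APF_ENABLED		(1ULL << 0)	/* KVM_ASYNC_PF_ENABLED         */
#define APF_SEND_ALWAYS		(1ULL << 1)	/* KVM_ASYNC_PF_SEND_ALWAYS     */
#define APF_DELIVERY_AS_INT	(1ULL << 3)	/* KVM_ASYNC_PF_DELIVERY_AS_INT */

static uint64_t async_pf_en_val(uint64_t apf_area_gpa)
{
	/* Bits 4:5 must stay zero or KVM rejects the write. */
	return (apf_area_gpa & ~0x3fULL) | APF_ENABLED | APF_DELIVERY_AS_INT;
}

static uint64_t async_pf_int_val(uint8_t vector)
{
	return vector;	/* bits 8-63 are reserved and must be zero */
}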
23883010
23893011 static void kvmclock_reset(struct kvm_vcpu *vcpu)
23903012 {
23913013 vcpu->arch.pv_time_enabled = false;
3014
+ vcpu->arch.time = 0;
23923015 }
23933016
2394
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
3017
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
23953018 {
23963019 ++vcpu->stat.tlb_flush;
2397
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
3020
+ kvm_x86_ops.tlb_flush_all(vcpu);
3021
+}
3022
+
3023
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3024
+{
3025
+ ++vcpu->stat.tlb_flush;
3026
+ kvm_x86_ops.tlb_flush_guest(vcpu);
23983027 }
23993028
24003029 static void record_steal_time(struct kvm_vcpu *vcpu)
24013030 {
2402
- struct kvm_host_map map;
2403
- struct kvm_steal_time *st;
3031
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3032
+ struct kvm_steal_time __user *st;
3033
+ struct kvm_memslots *slots;
3034
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3035
+ u64 steal;
3036
+ u32 version;
24043037
24053038 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
24063039 return;
24073040
2408
- /* -EAGAIN is returned in atomic context so we can just return. */
2409
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
2410
- &map, &vcpu->arch.st.cache, false))
3041
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
24113042 return;
24123043
2413
- st = map.hva +
2414
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3044
+ slots = kvm_memslots(vcpu->kvm);
24153045
3046
+ if (unlikely(slots->generation != ghc->generation ||
3047
+ gpa != ghc->gpa ||
3048
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3049
+ /* We rely on the fact that it fits in a single page. */
3050
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3051
+
3052
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3053
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3054
+ return;
3055
+ }
3056
+
3057
+ st = (struct kvm_steal_time __user *)ghc->hva;
24163058 /*
24173059 * Doing a TLB flush here, on the guest's behalf, can avoid
24183060 * expensive IPIs.
24193061 */
2420
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2421
- kvm_vcpu_flush_tlb(vcpu, false);
3062
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3063
+ u8 st_preempted = 0;
3064
+ int err = -EFAULT;
24223065
2423
- vcpu->arch.st.preempted = 0;
3066
+ if (!user_access_begin(st, sizeof(*st)))
3067
+ return;
24243068
2425
- if (st->version & 1)
2426
- st->version += 1; /* first time write, random junk */
3069
+ asm volatile("1: xchgb %0, %2\n"
3070
+ "xor %1, %1\n"
3071
+ "2:\n"
3072
+ _ASM_EXTABLE_UA(1b, 2b)
3073
+ : "+q" (st_preempted),
3074
+ "+&r" (err),
3075
+ "+m" (st->preempted));
3076
+ if (err)
3077
+ goto out;
24273078
2428
- st->version += 1;
3079
+ user_access_end();
3080
+
3081
+ vcpu->arch.st.preempted = 0;
3082
+
3083
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3084
+ st_preempted & KVM_VCPU_FLUSH_TLB);
3085
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
3086
+ kvm_vcpu_flush_tlb_guest(vcpu);
3087
+
3088
+ if (!user_access_begin(st, sizeof(*st)))
3089
+ goto dirty;
3090
+ } else {
3091
+ if (!user_access_begin(st, sizeof(*st)))
3092
+ return;
3093
+
3094
+ unsafe_put_user(0, &st->preempted, out);
3095
+ vcpu->arch.st.preempted = 0;
3096
+ }
3097
+
3098
+ unsafe_get_user(version, &st->version, out);
3099
+ if (version & 1)
3100
+ version += 1; /* first time write, random junk */
3101
+
3102
+ version += 1;
3103
+ unsafe_put_user(version, &st->version, out);
24293104
24303105 smp_wmb();
24313106
2432
- st->steal += current->sched_info.run_delay -
3107
+ unsafe_get_user(steal, &st->steal, out);
3108
+ steal += current->sched_info.run_delay -
24333109 vcpu->arch.st.last_steal;
24343110 vcpu->arch.st.last_steal = current->sched_info.run_delay;
3111
+ unsafe_put_user(steal, &st->steal, out);
24353112
2436
- smp_wmb();
3113
+ version += 1;
3114
+ unsafe_put_user(version, &st->version, out);
24373115
2438
- st->version += 1;
2439
-
2440
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
3116
+ out:
3117
+ user_access_end();
3118
+ dirty:
3119
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
24413120 }
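/*
 * Guest-side sketch of reading the steal-time record published by
 * record_steal_time() above.  The version field is assumed to follow the
 * usual pvclock convention: odd while KVM is updating, bumped to even
 * when the update is complete, so the reader retries until it sees a
 * stable even value.  The struct below is a simplified stand-in, not the
 * uapi layout.
 */
#include <stdint.h>

struct steal_time_shared {
	uint64_t steal;		/* nanoseconds of involuntary wait */
	uint32_t version;
	uint8_t  preempted;
};

static uint64_t read_steal_ns(const volatile struct steal_time_shared *st)
{
	uint32_t version;
	uint64_t steal;

	do {
		version = st->version;
		__sync_synchronize();	/* pairs with KVM's smp_wmb() above */
		steal = st->steal;
		__sync_synchronize();
	} while ((version & 1) || version != st->version);

	return steal;
}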
24423121
24433122 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
....@@ -2465,14 +3144,31 @@
24653144 return 1;
24663145 vcpu->arch.arch_capabilities = data;
24673146 break;
3147
+ case MSR_IA32_PERF_CAPABILITIES: {
3148
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3149
+
3150
+ if (!msr_info->host_initiated)
3151
+ return 1;
3152
+ if (kvm_get_msr_feature(&msr_ent))
3153
+ return 1;
3154
+ if (data & ~msr_ent.data)
3155
+ return 1;
3156
+
3157
+ vcpu->arch.perf_capabilities = data;
3158
+
3159
+ return 0;
3160
+ }
24683161 case MSR_EFER:
24693162 return set_efer(vcpu, msr_info);
24703163 case MSR_K7_HWCR:
24713164 data &= ~(u64)0x40; /* ignore flush filter disable */
24723165 data &= ~(u64)0x100; /* ignore ignne emulation enable */
24733166 data &= ~(u64)0x8; /* ignore TLB cache disable */
2474
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
2475
- if (data != 0) {
3167
+
3168
+ /* Handle McStatusWrEn */
3169
+ if (data == BIT_ULL(18)) {
3170
+ vcpu->arch.msr_hwcr = data;
3171
+ } else if (data != 0) {
24763172 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
24773173 data);
24783174 return 1;
....@@ -2493,9 +3189,9 @@
24933189 /* Values other than LBR and BTF are vendor-specific,
24943190 thus reserved and should throw a #GP */
24953191 return 1;
2496
- }
2497
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2498
- __func__, data);
3192
+ } else if (report_ignored_msrs)
3193
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3194
+ __func__, data);
24993195 break;
25003196 case 0x200 ... 0x2ff:
25013197 return kvm_mtrr_set_msr(vcpu, msr, data);
....@@ -2520,15 +3216,46 @@
25203216 }
25213217 break;
25223218 case MSR_IA32_MISC_ENABLE:
2523
- vcpu->arch.ia32_misc_enable_msr = data;
3219
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3220
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3221
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3222
+ return 1;
3223
+ vcpu->arch.ia32_misc_enable_msr = data;
3224
+ kvm_update_cpuid_runtime(vcpu);
3225
+ } else {
3226
+ vcpu->arch.ia32_misc_enable_msr = data;
3227
+ }
25243228 break;
25253229 case MSR_IA32_SMBASE:
25263230 if (!msr_info->host_initiated)
25273231 return 1;
25283232 vcpu->arch.smbase = data;
25293233 break;
3234
+ case MSR_IA32_POWER_CTL:
3235
+ vcpu->arch.msr_ia32_power_ctl = data;
3236
+ break;
25303237 case MSR_IA32_TSC:
2531
- kvm_write_tsc(vcpu, msr_info);
3238
+ if (msr_info->host_initiated) {
3239
+ kvm_synchronize_tsc(vcpu, data);
3240
+ } else {
3241
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3242
+ adjust_tsc_offset_guest(vcpu, adj);
3243
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
3244
+ }
3245
+ break;
3246
+ case MSR_IA32_XSS:
3247
+ if (!msr_info->host_initiated &&
3248
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3249
+ return 1;
3250
+ /*
3251
+ * KVM supports exposing PT to the guest, but does not support
3252
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3253
+ * XSAVES/XRSTORS to save/restore PT MSRs.
3254
+ */
3255
+ if (data & ~supported_xss)
3256
+ return 1;
3257
+ vcpu->arch.ia32_xss = data;
3258
+ kvm_update_cpuid_runtime(vcpu);
25323259 break;
25333260 case MSR_SMI_COUNT:
25343261 if (!msr_info->host_initiated)
....@@ -2536,46 +3263,54 @@
25363263 vcpu->arch.smi_count = data;
25373264 break;
25383265 case MSR_KVM_WALL_CLOCK_NEW:
3266
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3267
+ return 1;
3268
+
3269
+ kvm_write_wall_clock(vcpu->kvm, data);
3270
+ break;
25393271 case MSR_KVM_WALL_CLOCK:
2540
- vcpu->kvm->arch.wall_clock = data;
3272
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3273
+ return 1;
3274
+
25413275 kvm_write_wall_clock(vcpu->kvm, data);
25423276 break;
25433277 case MSR_KVM_SYSTEM_TIME_NEW:
2544
- case MSR_KVM_SYSTEM_TIME: {
2545
- struct kvm_arch *ka = &vcpu->kvm->arch;
3278
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3279
+ return 1;
25463280
2547
- kvmclock_reset(vcpu);
2548
-
2549
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2550
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2551
-
2552
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2553
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2554
-
2555
- ka->boot_vcpu_runs_old_kvmclock = tmp;
2556
- }
2557
-
2558
- vcpu->arch.time = data;
2559
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2560
-
2561
- /* we verify if the enable bit is set... */
2562
- if (!(data & 1))
2563
- break;
2564
-
2565
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2566
- &vcpu->arch.pv_time, data & ~1ULL,
2567
- sizeof(struct pvclock_vcpu_time_info)))
2568
- vcpu->arch.pv_time_enabled = false;
2569
- else
2570
- vcpu->arch.pv_time_enabled = true;
2571
-
3281
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
25723282 break;
2573
- }
3283
+ case MSR_KVM_SYSTEM_TIME:
3284
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3285
+ return 1;
3286
+
3287
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3288
+ break;
25743289 case MSR_KVM_ASYNC_PF_EN:
3290
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3291
+ return 1;
3292
+
25753293 if (kvm_pv_enable_async_pf(vcpu, data))
25763294 return 1;
25773295 break;
3296
+ case MSR_KVM_ASYNC_PF_INT:
3297
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3298
+ return 1;
3299
+
3300
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
3301
+ return 1;
3302
+ break;
3303
+ case MSR_KVM_ASYNC_PF_ACK:
3304
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3305
+ return 1;
3306
+ if (data & 0x1) {
3307
+ vcpu->arch.apf.pageready_pending = false;
3308
+ kvm_check_async_pf_completion(vcpu);
3309
+ }
3310
+ break;
25783311 case MSR_KVM_STEAL_TIME:
3312
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3313
+ return 1;
25793314
25803315 if (unlikely(!sched_info_on()))
25813316 return 1;
....@@ -2592,8 +3327,22 @@
25923327
25933328 break;
25943329 case MSR_KVM_PV_EOI_EN:
3330
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3331
+ return 1;
3332
+
25953333 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
25963334 return 1;
3335
+ break;
3336
+
3337
+ case MSR_KVM_POLL_CONTROL:
3338
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3339
+ return 1;
3340
+
3341
+ /* only enable bit supported */
3342
+ if (data & (-1ULL << 1))
3343
+ return 1;
3344
+
3345
+ vcpu->arch.msr_kvm_poll_control = data;
25973346 break;
25983347
25993348 case MSR_IA32_MCG_CTL:
....@@ -2603,7 +3352,8 @@
26033352
26043353 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
26053354 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2606
- pr = true; /* fall through */
3355
+ pr = true;
3356
+ fallthrough;
26073357 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
26083358 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
26093359 if (kvm_pmu_is_valid_msr(vcpu, msr))
....@@ -2624,6 +3374,8 @@
26243374 */
26253375 break;
26263376 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3377
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3378
+ case HV_X64_MSR_SYNDBG_OPTIONS:
26273379 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
26283380 case HV_X64_MSR_CRASH_CTL:
26293381 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2669,33 +3421,11 @@
26693421 return xen_hvm_config(vcpu, data);
26703422 if (kvm_pmu_is_valid_msr(vcpu, msr))
26713423 return kvm_pmu_set_msr(vcpu, msr_info);
2672
- if (!ignore_msrs) {
2673
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2674
- msr, data);
2675
- return 1;
2676
- } else {
2677
- if (report_ignored_msrs)
2678
- vcpu_unimpl(vcpu,
2679
- "ignored wrmsr: 0x%x data 0x%llx\n",
2680
- msr, data);
2681
- break;
2682
- }
3424
+ return KVM_MSR_RET_INVALID;
26833425 }
26843426 return 0;
26853427 }
26863428 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2687
-
2688
-
2689
-/*
2690
- * Reads an msr value (of 'msr_index') into 'pdata'.
2691
- * Returns 0 on success, non-0 otherwise.
2692
- * Assumes vcpu_load() was already called.
2693
- */
2694
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2695
-{
2696
- return kvm_x86_ops->get_msr(vcpu, msr);
2697
-}
2698
-EXPORT_SYMBOL_GPL(kvm_get_msr);
26993429
27003430 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
27013431 {
....@@ -2748,7 +3478,6 @@
27483478 case MSR_K8_SYSCFG:
27493479 case MSR_K8_TSEG_ADDR:
27503480 case MSR_K8_TSEG_MASK:
2751
- case MSR_K7_HWCR:
27523481 case MSR_VM_HSAVE_PA:
27533482 case MSR_K8_INT_PENDING_MSG:
27543483 case MSR_AMD64_NB_CFG:
....@@ -2757,6 +3486,17 @@
27573486 case MSR_IA32_PERF_CTL:
27583487 case MSR_AMD64_DC_CFG:
27593488 case MSR_F15H_EX_CFG:
3489
+ /*
3490
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
3491
+ * limit) MSRs. Just return 0, as we do not want to expose the host
3492
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
3493
+ * so for existing CPU-specific MSRs.
3494
+ */
3495
+ case MSR_RAPL_POWER_UNIT:
3496
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
3497
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
3498
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
3499
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
27603500 msr_info->data = 0;
27613501 break;
27623502 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
....@@ -2765,7 +3505,7 @@
27653505 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
27663506 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
27673507 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2768
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
3508
+ return kvm_pmu_get_msr(vcpu, msr_info);
27693509 msr_info->data = 0;
27703510 break;
27713511 case MSR_IA32_UCODE_REV:
....@@ -2777,9 +3517,31 @@
27773517 return 1;
27783518 msr_info->data = vcpu->arch.arch_capabilities;
27793519 break;
2780
- case MSR_IA32_TSC:
2781
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
3520
+ case MSR_IA32_PERF_CAPABILITIES:
3521
+ if (!msr_info->host_initiated &&
3522
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3523
+ return 1;
3524
+ msr_info->data = vcpu->arch.perf_capabilities;
27823525 break;
3526
+ case MSR_IA32_POWER_CTL:
3527
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3528
+ break;
3529
+ case MSR_IA32_TSC: {
3530
+ /*
3531
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3532
+ * even when not intercepted. AMD manual doesn't explicitly
3533
+ * state this but appears to behave the same.
3534
+ *
3535
+ * On userspace reads and writes, however, we unconditionally
3536
+ * return L1's TSC value to ensure backwards-compatible
3537
+ * behavior for migration.
3538
+ */
3539
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3540
+ vcpu->arch.tsc_offset;
3541
+
3542
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3543
+ break;
3544
+ }
27833545 case MSR_MTRRcap:
27843546 case 0x200 ... 0x2ff:
27853547 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
....@@ -2805,7 +3567,6 @@
28053567 break;
28063568 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
28073569 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2808
- break;
28093570 case MSR_IA32_TSCDEADLINE:
28103571 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
28113572 break;
....@@ -2833,21 +3594,64 @@
28333594 msr_info->data = vcpu->arch.efer;
28343595 break;
28353596 case MSR_KVM_WALL_CLOCK:
3597
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3598
+ return 1;
3599
+
3600
+ msr_info->data = vcpu->kvm->arch.wall_clock;
3601
+ break;
28363602 case MSR_KVM_WALL_CLOCK_NEW:
3603
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3604
+ return 1;
3605
+
28373606 msr_info->data = vcpu->kvm->arch.wall_clock;
28383607 break;
28393608 case MSR_KVM_SYSTEM_TIME:
3609
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3610
+ return 1;
3611
+
3612
+ msr_info->data = vcpu->arch.time;
3613
+ break;
28403614 case MSR_KVM_SYSTEM_TIME_NEW:
3615
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3616
+ return 1;
3617
+
28413618 msr_info->data = vcpu->arch.time;
28423619 break;
28433620 case MSR_KVM_ASYNC_PF_EN:
2844
- msr_info->data = vcpu->arch.apf.msr_val;
3621
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3622
+ return 1;
3623
+
3624
+ msr_info->data = vcpu->arch.apf.msr_en_val;
3625
+ break;
3626
+ case MSR_KVM_ASYNC_PF_INT:
3627
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3628
+ return 1;
3629
+
3630
+ msr_info->data = vcpu->arch.apf.msr_int_val;
3631
+ break;
3632
+ case MSR_KVM_ASYNC_PF_ACK:
3633
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3634
+ return 1;
3635
+
3636
+ msr_info->data = 0;
28453637 break;
28463638 case MSR_KVM_STEAL_TIME:
3639
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3640
+ return 1;
3641
+
28473642 msr_info->data = vcpu->arch.st.msr_val;
28483643 break;
28493644 case MSR_KVM_PV_EOI_EN:
3645
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3646
+ return 1;
3647
+
28503648 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3649
+ break;
3650
+ case MSR_KVM_POLL_CONTROL:
3651
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3652
+ return 1;
3653
+
3654
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
28513655 break;
28523656 case MSR_IA32_P5_MC_ADDR:
28533657 case MSR_IA32_P5_MC_TYPE:
....@@ -2857,6 +3661,12 @@
28573661 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
28583662 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
28593663 msr_info->host_initiated);
3664
+ case MSR_IA32_XSS:
3665
+ if (!msr_info->host_initiated &&
3666
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3667
+ return 1;
3668
+ msr_info->data = vcpu->arch.ia32_xss;
3669
+ break;
28603670 case MSR_K7_CLK_CTL:
28613671 /*
28623672 * Provide expected ramp-up count for K7. All other
....@@ -2870,6 +3680,8 @@
28703680 msr_info->data = 0x20000000;
28713681 break;
28723682 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3683
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3684
+ case HV_X64_MSR_SYNDBG_OPTIONS:
28733685 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
28743686 case HV_X64_MSR_CRASH_CTL:
28753687 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2879,7 +3691,6 @@
28793691 return kvm_hv_get_msr_common(vcpu,
28803692 msr_info->index, &msr_info->data,
28813693 msr_info->host_initiated);
2882
- break;
28833694 case MSR_IA32_BBL_CR_CTL3:
28843695 /* This legacy MSR exists but isn't fully documented in current
28853696 * silicon. It is however accessed by winxp in very narrow
....@@ -2912,20 +3723,13 @@
29123723 case MSR_MISC_FEATURES_ENABLES:
29133724 msr_info->data = vcpu->arch.msr_misc_features_enables;
29143725 break;
3726
+ case MSR_K7_HWCR:
3727
+ msr_info->data = vcpu->arch.msr_hwcr;
3728
+ break;
29153729 default:
29163730 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2917
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2918
- if (!ignore_msrs) {
2919
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2920
- msr_info->index);
2921
- return 1;
2922
- } else {
2923
- if (report_ignored_msrs)
2924
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2925
- msr_info->index);
2926
- msr_info->data = 0;
2927
- }
2928
- break;
3731
+ return kvm_pmu_get_msr(vcpu, msr_info);
3732
+ return KVM_MSR_RET_INVALID;
29293733 }
29303734 return 0;
29313735 }
....@@ -2966,7 +3770,7 @@
29663770 unsigned size;
29673771
29683772 r = -EFAULT;
2969
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
3773
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
29703774 goto out;
29713775
29723776 r = -E2BIG;
....@@ -3037,24 +3841,33 @@
30373841 case KVM_CAP_HYPERV_VP_INDEX:
30383842 case KVM_CAP_HYPERV_EVENTFD:
30393843 case KVM_CAP_HYPERV_TLBFLUSH:
3844
+ case KVM_CAP_HYPERV_SEND_IPI:
3845
+ case KVM_CAP_HYPERV_CPUID:
30403846 case KVM_CAP_PCI_SEGMENT:
30413847 case KVM_CAP_DEBUGREGS:
30423848 case KVM_CAP_X86_ROBUST_SINGLESTEP:
30433849 case KVM_CAP_XSAVE:
30443850 case KVM_CAP_ASYNC_PF:
3851
+ case KVM_CAP_ASYNC_PF_INT:
30453852 case KVM_CAP_GET_TSC_KHZ:
30463853 case KVM_CAP_KVMCLOCK_CTRL:
30473854 case KVM_CAP_READONLY_MEM:
30483855 case KVM_CAP_HYPERV_TIME:
30493856 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
30503857 case KVM_CAP_TSC_DEADLINE_TIMER:
3051
- case KVM_CAP_ENABLE_CAP_VM:
30523858 case KVM_CAP_DISABLE_QUIRKS:
30533859 case KVM_CAP_SET_BOOT_CPU_ID:
30543860 case KVM_CAP_SPLIT_IRQCHIP:
30553861 case KVM_CAP_IMMEDIATE_EXIT:
3862
+ case KVM_CAP_PMU_EVENT_FILTER:
30563863 case KVM_CAP_GET_MSR_FEATURES:
30573864 case KVM_CAP_MSR_PLATFORM_INFO:
3865
+ case KVM_CAP_EXCEPTION_PAYLOAD:
3866
+ case KVM_CAP_SET_GUEST_DEBUG:
3867
+ case KVM_CAP_LAST_CPU:
3868
+ case KVM_CAP_X86_USER_SPACE_MSR:
3869
+ case KVM_CAP_X86_MSR_FILTER:
3870
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
30583871 r = 1;
30593872 break;
30603873 case KVM_CAP_SYNC_REGS:
....@@ -3064,7 +3877,8 @@
30643877 r = KVM_CLOCK_TSC_STABLE;
30653878 break;
30663879 case KVM_CAP_X86_DISABLE_EXITS:
3067
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
3880
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3881
+ KVM_X86_DISABLE_EXITS_CSTATE;
30683882 if(kvm_can_mwait_in_guest())
30693883 r |= KVM_X86_DISABLE_EXITS_MWAIT;
30703884 break;
....@@ -3077,10 +3891,10 @@
30773891 * fringe case that is not enabled except via specific settings
30783892 * of the module parameters.
30793893 */
3080
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
3894
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
30813895 break;
30823896 case KVM_CAP_VAPIC:
3083
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
3897
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
30843898 break;
30853899 case KVM_CAP_NR_VCPUS:
30863900 r = KVM_SOFT_MAX_VCPUS;
....@@ -3090,9 +3904,6 @@
30903904 break;
30913905 case KVM_CAP_MAX_VCPU_ID:
30923906 r = KVM_MAX_VCPU_ID;
3093
- break;
3094
- case KVM_CAP_NR_MEMSLOTS:
3095
- r = KVM_USER_MEM_SLOTS;
30963907 break;
30973908 case KVM_CAP_PV_MMU: /* obsolete */
30983909 r = 0;
....@@ -3110,8 +3921,20 @@
31103921 r = KVM_X2APIC_API_VALID_FLAGS;
31113922 break;
31123923 case KVM_CAP_NESTED_STATE:
3113
- r = kvm_x86_ops->get_nested_state ?
3114
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
3924
+ r = kvm_x86_ops.nested_ops->get_state ?
3925
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3926
+ break;
3927
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3928
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3929
+ break;
3930
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3931
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3932
+ break;
3933
+ case KVM_CAP_SMALLER_MAXPHYADDR:
3934
+ r = (int) allow_smaller_maxphyaddr;
3935
+ break;
3936
+ case KVM_CAP_STEAL_TIME:
3937
+ r = sched_info_on();
31153938 break;
31163939 default:
31173940 break;
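/*
 * User-space sketch of probing the capabilities enumerated in the switch
 * above.  KVM_CHECK_EXTENSION returns 0 when a capability is absent and a
 * non-zero value (sometimes a feature bitmap, as for
 * KVM_CAP_X86_DISABLE_EXITS) when present.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void probe_caps(int vm_fd)
{
	int disable_exits = ioctl(vm_fd, KVM_CHECK_EXTENSION,
				  KVM_CAP_X86_DISABLE_EXITS);

	if (disable_exits > 0 && (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT))
		printf("MWAIT exits can be disabled\n");

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_USER_SPACE_MSR) > 0)
		printf("user-space MSR handling supported\n");
}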
....@@ -3133,11 +3956,11 @@
31333956 unsigned n;
31343957
31353958 r = -EFAULT;
3136
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3959
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
31373960 goto out;
31383961 n = msr_list.nmsrs;
31393962 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3140
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3963
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
31413964 goto out;
31423965 r = -E2BIG;
31433966 if (n < msr_list.nmsrs)
....@@ -3159,7 +3982,7 @@
31593982 struct kvm_cpuid2 cpuid;
31603983
31613984 r = -EFAULT;
3162
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3985
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
31633986 goto out;
31643987
31653988 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
....@@ -3168,12 +3991,12 @@
31683991 goto out;
31693992
31703993 r = -EFAULT;
3171
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3994
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
31723995 goto out;
31733996 r = 0;
31743997 break;
31753998 }
3176
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3999
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
31774000 r = -EFAULT;
31784001 if (copy_to_user(argp, &kvm_mce_cap_supported,
31794002 sizeof(kvm_mce_cap_supported)))
....@@ -3205,9 +4028,9 @@
32054028 case KVM_GET_MSRS:
32064029 r = msr_io(NULL, argp, do_get_msr_feature, 1);
32074030 break;
3208
- }
32094031 default:
32104032 r = -EINVAL;
4033
+ break;
32114034 }
32124035 out:
32134036 return r;
....@@ -3227,14 +4050,17 @@
32274050 {
32284051 /* Address WBINVD may be executed by guest */
32294052 if (need_emulate_wbinvd(vcpu)) {
3230
- if (kvm_x86_ops->has_wbinvd_exit())
4053
+ if (kvm_x86_ops.has_wbinvd_exit())
32314054 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
32324055 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
32334056 smp_call_function_single(vcpu->cpu,
32344057 wbinvd_ipi, NULL, 1);
32354058 }
32364059
3237
- kvm_x86_ops->vcpu_load(vcpu, cpu);
4060
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
4061
+
4062
+ /* Save host pkru register if supported */
4063
+ vcpu->arch.host_pkru = read_pkru();
32384064
32394065 /* Apply any externally detected TSC adjustments (due to suspend) */
32404066 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
....@@ -3275,52 +4101,68 @@
32754101
32764102 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
32774103 {
3278
- struct kvm_host_map map;
3279
- struct kvm_steal_time *st;
4104
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4105
+ struct kvm_steal_time __user *st;
4106
+ struct kvm_memslots *slots;
4107
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
4108
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
32804109
4110
+ /*
4111
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
4112
+ * an instruction boundary and will not trigger guest emulation of any
4113
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
4114
+ * when this is true, for example allowing the vCPU to be marked
4115
+ * preempted if and only if the VM-Exit was due to a host interrupt.
4116
+ */
4117
+ if (!vcpu->arch.at_instruction_boundary) {
4118
+ vcpu->stat.preemption_other++;
4119
+ return;
4120
+ }
4121
+
4122
+ vcpu->stat.preemption_reported++;
32814123 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
32824124 return;
32834125
32844126 if (vcpu->arch.st.preempted)
32854127 return;
32864128
3287
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
3288
- &vcpu->arch.st.cache, true))
4129
+ /* This happens on process exit */
4130
+ if (unlikely(current->mm != vcpu->kvm->mm))
32894131 return;
32904132
3291
- st = map.hva +
3292
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
4133
+ slots = kvm_memslots(vcpu->kvm);
32934134
3294
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4135
+ if (unlikely(slots->generation != ghc->generation ||
4136
+ gpa != ghc->gpa ||
4137
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
4138
+ return;
32954139
3296
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
4140
+ st = (struct kvm_steal_time __user *)ghc->hva;
4141
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
4142
+
4143
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4144
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4145
+
4146
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
32974147 }
32984148
32994149 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
33004150 {
33014151 int idx;
33024152
3303
- if (vcpu->preempted)
3304
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
4153
+ if (vcpu->preempted) {
4154
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
33054155
3306
- /*
3307
- * Disable page faults because we're in atomic context here.
3308
- * kvm_write_guest_offset_cached() would call might_fault()
3309
- * that relies on pagefault_disable() to tell if there's a
3310
- * bug. NOTE: the write to guest memory may not go through if
3311
- * during postcopy live migration or if there's heavy guest
3312
- * paging.
3313
- */
3314
- pagefault_disable();
3315
- /*
3316
- * kvm_memslots() will be called by
3317
- * kvm_write_guest_offset_cached() so take the srcu lock.
3318
- */
3319
- idx = srcu_read_lock(&vcpu->kvm->srcu);
3320
- kvm_steal_time_set_preempted(vcpu);
3321
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
3322
- pagefault_enable();
3323
- kvm_x86_ops->vcpu_put(vcpu);
4156
+ /*
4157
+ * Take the srcu lock as memslots will be accessed to check the gfn
4158
+ * cache generation against the memslots generation.
4159
+ */
4160
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
4161
+ kvm_steal_time_set_preempted(vcpu);
4162
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
4163
+ }
4164
+
4165
+ kvm_x86_ops.vcpu_put(vcpu);
33244166 vcpu->arch.last_host_tsc = rdtsc();
33254167 /*
33264168 * If userspace has set any breakpoints or watchpoints, dr6 is restored
....@@ -3334,7 +4176,7 @@
33344176 struct kvm_lapic_state *s)
33354177 {
33364178 if (vcpu->arch.apicv_active)
3337
- kvm_x86_ops->sync_pir_to_irr(vcpu);
4179
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
33384180
33394181 return kvm_apic_get_state(vcpu, s);
33404182 }
....@@ -3453,8 +4295,7 @@
34534295 for (bank = 0; bank < bank_num; bank++)
34544296 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
34554297
3456
- if (kvm_x86_ops->setup_mce)
3457
- kvm_x86_ops->setup_mce(vcpu);
4298
+ kvm_x86_ops.setup_mce(vcpu);
34584299 out:
34594300 return r;
34604301 }
....@@ -3516,28 +4357,56 @@
35164357 process_smi(vcpu);
35174358
35184359 /*
3519
- * FIXME: pass injected and pending separately. This is only
3520
- * needed for nested virtualization, whose state cannot be
3521
- * migrated yet. For now we can combine them.
4360
+ * In guest mode, payload delivery should be deferred,
4361
+ * so that the L1 hypervisor can intercept #PF before
4362
+ * CR2 is modified (or intercept #DB before DR6 is
4363
+ * modified under nVMX). Unless the per-VM capability,
4364
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4365
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
4366
+ * opportunistically defer the exception payload, deliver it if the
4367
+ * capability hasn't been requested before processing a
4368
+ * KVM_GET_VCPU_EVENTS.
35224369 */
3523
- events->exception.injected =
3524
- (vcpu->arch.exception.pending ||
3525
- vcpu->arch.exception.injected) &&
3526
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
4370
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
4371
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4372
+ kvm_deliver_exception_payload(vcpu);
4373
+
4374
+ /*
4375
+ * The API doesn't provide the instruction length for software
4376
+ * exceptions, so don't report them. As long as the guest RIP
4377
+ * isn't advanced, we should expect to encounter the exception
4378
+ * again.
4379
+ */
4380
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4381
+ events->exception.injected = 0;
4382
+ events->exception.pending = 0;
4383
+ } else {
4384
+ events->exception.injected = vcpu->arch.exception.injected;
4385
+ events->exception.pending = vcpu->arch.exception.pending;
4386
+ /*
4387
+ * For ABI compatibility, deliberately conflate
4388
+ * pending and injected exceptions when
4389
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4390
+ */
4391
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4392
+ events->exception.injected |=
4393
+ vcpu->arch.exception.pending;
4394
+ }
35274395 events->exception.nr = vcpu->arch.exception.nr;
35284396 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3529
- events->exception.pad = 0;
35304397 events->exception.error_code = vcpu->arch.exception.error_code;
4398
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
4399
+ events->exception_payload = vcpu->arch.exception.payload;
35314400
35324401 events->interrupt.injected =
35334402 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
35344403 events->interrupt.nr = vcpu->arch.interrupt.nr;
35354404 events->interrupt.soft = 0;
3536
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
4405
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
35374406
35384407 events->nmi.injected = vcpu->arch.nmi_injected;
35394408 events->nmi.pending = vcpu->arch.nmi_pending != 0;
3540
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
4409
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
35414410 events->nmi.pad = 0;
35424411
35434412 events->sipi_vector = 0; /* never valid when reporting to user space */
....@@ -3551,10 +4420,13 @@
35514420 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
35524421 | KVM_VCPUEVENT_VALID_SHADOW
35534422 | KVM_VCPUEVENT_VALID_SMM);
4423
+ if (vcpu->kvm->arch.exception_payload_enabled)
4424
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4425
+
35544426 memset(&events->reserved, 0, sizeof(events->reserved));
35554427 }
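/*
 * User-space sketch of consuming the exception-payload state exposed by
 * the function above.  KVM_VCPUEVENT_VALID_PAYLOAD appears in events.flags
 * only when the VM has enabled KVM_CAP_EXCEPTION_PAYLOAD; otherwise
 * pending and injected exceptions are conflated, as noted in the comment.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int dump_pending_exception(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
		return -1;

	if ((events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
	    events.exception_has_payload)
		printf("vector %u, payload 0x%llx\n",
		       (unsigned)events.exception.nr,
		       (unsigned long long)events.exception_payload);
	return 0;
}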
35564428
3557
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
4429
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
35584430
35594431 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
35604432 struct kvm_vcpu_events *events)
....@@ -3562,12 +4434,24 @@
35624434 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
35634435 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
35644436 | KVM_VCPUEVENT_VALID_SHADOW
3565
- | KVM_VCPUEVENT_VALID_SMM))
4437
+ | KVM_VCPUEVENT_VALID_SMM
4438
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
35664439 return -EINVAL;
35674440
3568
- if (events->exception.injected &&
3569
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3570
- is_guest_mode(vcpu)))
4441
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4442
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4443
+ return -EINVAL;
4444
+ if (events->exception.pending)
4445
+ events->exception.injected = 0;
4446
+ else
4447
+ events->exception_has_payload = 0;
4448
+ } else {
4449
+ events->exception.pending = 0;
4450
+ events->exception_has_payload = 0;
4451
+ }
4452
+
4453
+ if ((events->exception.injected || events->exception.pending) &&
4454
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
35714455 return -EINVAL;
35724456
35734457 /* INITs are latched while in SMM */
....@@ -3577,35 +4461,40 @@
35774461 return -EINVAL;
35784462
35794463 process_nmi(vcpu);
3580
- vcpu->arch.exception.injected = false;
3581
- vcpu->arch.exception.pending = events->exception.injected;
4464
+ vcpu->arch.exception.injected = events->exception.injected;
4465
+ vcpu->arch.exception.pending = events->exception.pending;
35824466 vcpu->arch.exception.nr = events->exception.nr;
35834467 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
35844468 vcpu->arch.exception.error_code = events->exception.error_code;
4469
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
4470
+ vcpu->arch.exception.payload = events->exception_payload;
35854471
35864472 vcpu->arch.interrupt.injected = events->interrupt.injected;
35874473 vcpu->arch.interrupt.nr = events->interrupt.nr;
35884474 vcpu->arch.interrupt.soft = events->interrupt.soft;
35894475 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3590
- kvm_x86_ops->set_interrupt_shadow(vcpu,
4476
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
35914477 events->interrupt.shadow);
35924478
35934479 vcpu->arch.nmi_injected = events->nmi.injected;
35944480 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
35954481 vcpu->arch.nmi_pending = events->nmi.pending;
3596
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
4482
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
35974483
35984484 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
35994485 lapic_in_kernel(vcpu))
36004486 vcpu->arch.apic->sipi_vector = events->sipi_vector;
36014487
36024488 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3603
- u32 hflags = vcpu->arch.hflags;
3604
- if (events->smi.smm)
3605
- hflags |= HF_SMM_MASK;
3606
- else
3607
- hflags &= ~HF_SMM_MASK;
3608
- kvm_set_hflags(vcpu, hflags);
4489
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4490
+ if (events->smi.smm)
4491
+ vcpu->arch.hflags |= HF_SMM_MASK;
4492
+ else
4493
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
4494
+
4495
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
4496
+ kvm_smm_changed(vcpu);
4497
+ }
36094498
36104499 vcpu->arch.smi_pending = events->smi.pending;
36114500
....@@ -3614,12 +4503,13 @@
36144503 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
36154504 else
36164505 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3617
- if (lapic_in_kernel(vcpu)) {
3618
- if (events->smi.latched_init)
3619
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3620
- else
3621
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3622
- }
4506
+ }
4507
+
4508
+ if (lapic_in_kernel(vcpu)) {
4509
+ if (events->smi.latched_init)
4510
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4511
+ else
4512
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
36234513 }
36244514 }
36254515
....@@ -3633,12 +4523,11 @@
36334523 {
36344524 unsigned long val;
36354525
4526
+ memset(dbgregs, 0, sizeof(*dbgregs));
36364527 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
36374528 kvm_get_dr(vcpu, 6, &val);
36384529 dbgregs->dr6 = val;
36394530 dbgregs->dr7 = vcpu->arch.dr7;
3640
- dbgregs->flags = 0;
3641
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
36424531 }
36434532
36444533 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
....@@ -3655,7 +4544,6 @@
36554544 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
36564545 kvm_update_dr0123(vcpu);
36574546 vcpu->arch.dr6 = dbgregs->dr6;
3658
- kvm_update_dr6(vcpu);
36594547 vcpu->arch.dr7 = dbgregs->dr7;
36604548 kvm_update_dr7(vcpu);
36614549
....@@ -3666,7 +4554,7 @@
36664554
36674555 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
36684556 {
3669
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4557
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
36704558 u64 xstate_bv = xsave->header.xfeatures;
36714559 u64 valid;
36724560
....@@ -3686,15 +4574,15 @@
36864574 */
36874575 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
36884576 while (valid) {
3689
- u64 feature = valid & -valid;
3690
- int index = fls64(feature) - 1;
3691
- void *src = get_xsave_addr(xsave, feature);
4577
+ u64 xfeature_mask = valid & -valid;
4578
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4579
+ void *src = get_xsave_addr(xsave, xfeature_nr);
36924580
36934581 if (src) {
36944582 u32 size, offset, ecx, edx;
3695
- cpuid_count(XSTATE_CPUID, index,
4583
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
36964584 &size, &offset, &ecx, &edx);
3697
- if (feature == XFEATURE_MASK_PKRU)
4585
+ if (xfeature_nr == XFEATURE_PKRU)
36984586 memcpy(dest + offset, &vcpu->arch.pkru,
36994587 sizeof(vcpu->arch.pkru));
37004588 else
....@@ -3702,13 +4590,13 @@
37024590
37034591 }
37044592
3705
- valid -= feature;
4593
+ valid -= xfeature_mask;
37064594 }
37074595 }
37084596
37094597 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
37104598 {
3711
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4599
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
37124600 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
37134601 u64 valid;
37144602
....@@ -3729,22 +4617,22 @@
37294617 */
37304618 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
37314619 while (valid) {
3732
- u64 feature = valid & -valid;
3733
- int index = fls64(feature) - 1;
3734
- void *dest = get_xsave_addr(xsave, feature);
4620
+ u64 xfeature_mask = valid & -valid;
4621
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4622
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
37354623
37364624 if (dest) {
37374625 u32 size, offset, ecx, edx;
3738
- cpuid_count(XSTATE_CPUID, index,
4626
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
37394627 &size, &offset, &ecx, &edx);
3740
- if (feature == XFEATURE_MASK_PKRU)
4628
+ if (xfeature_nr == XFEATURE_PKRU)
37414629 memcpy(&vcpu->arch.pkru, src + offset,
37424630 sizeof(vcpu->arch.pkru));
37434631 else
37444632 memcpy(dest, src + offset, size);
37454633 }
37464634
3747
- valid -= feature;
4635
+ valid -= xfeature_mask;
37484636 }
37494637 }
37504638
....@@ -3756,7 +4644,7 @@
37564644 fill_xsave((u8 *) guest_xsave->region, vcpu);
37574645 } else {
37584646 memcpy(guest_xsave->region,
3759
- &vcpu->arch.guest_fpu.state.fxsave,
4647
+ &vcpu->arch.guest_fpu->state.fxsave,
37604648 sizeof(struct fxregs_state));
37614649 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
37624650 XFEATURE_MASK_FPSSE;
....@@ -3778,15 +4666,14 @@
37784666 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
37794667 * with old userspace.
37804668 */
3781
- if (xstate_bv & ~kvm_supported_xcr0() ||
3782
- mxcsr & ~mxcsr_feature_mask)
4669
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
37834670 return -EINVAL;
37844671 load_xsave(vcpu, (u8 *)guest_xsave->region);
37854672 } else {
37864673 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
37874674 mxcsr & ~mxcsr_feature_mask)
37884675 return -EINVAL;
3789
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
4676
+ memcpy(&vcpu->arch.guest_fpu->state.fxsave,
37904677 guest_xsave->region, sizeof(struct fxregs_state));
37914678 }
37924679 return 0;
....@@ -3847,6 +4734,10 @@
38474734 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
38484735 struct kvm_enable_cap *cap)
38494736 {
4737
+ int r;
4738
+ uint16_t vmcs_version;
4739
+ void __user *user_ptr;
4740
+
38504741 if (cap->flags)
38514742 return -EINVAL;
38524743
....@@ -3854,11 +4745,37 @@
38544745 case KVM_CAP_HYPERV_SYNIC2:
38554746 if (cap->args[0])
38564747 return -EINVAL;
4748
+ fallthrough;
4749
+
38574750 case KVM_CAP_HYPERV_SYNIC:
38584751 if (!irqchip_in_kernel(vcpu->kvm))
38594752 return -EINVAL;
38604753 return kvm_hv_activate_synic(vcpu, cap->cap ==
38614754 KVM_CAP_HYPERV_SYNIC2);
4755
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4756
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
4757
+ return -ENOTTY;
4758
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
4759
+ if (!r) {
4760
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
4761
+ if (copy_to_user(user_ptr, &vmcs_version,
4762
+ sizeof(vmcs_version)))
4763
+ r = -EFAULT;
4764
+ }
4765
+ return r;
4766
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4767
+ if (!kvm_x86_ops.enable_direct_tlbflush)
4768
+ return -ENOTTY;
4769
+
4770
+ return kvm_x86_ops.enable_direct_tlbflush(vcpu);
4771
+
4772
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4773
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
4774
+ if (vcpu->arch.pv_cpuid.enforce)
4775
+ kvm_update_pv_runtime(vcpu);
4776
+
4777
+ return 0;
4778
+
38624779 default:
38634780 return -EINVAL;
38644781 }
....@@ -3885,7 +4802,8 @@
38854802 r = -EINVAL;
38864803 if (!lapic_in_kernel(vcpu))
38874804 goto out;
3888
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
4805
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4806
+ GFP_KERNEL_ACCOUNT);
38894807
38904808 r = -ENOMEM;
38914809 if (!u.lapic)
....@@ -3916,7 +4834,7 @@
39164834 struct kvm_interrupt irq;
39174835
39184836 r = -EFAULT;
3919
- if (copy_from_user(&irq, argp, sizeof irq))
4837
+ if (copy_from_user(&irq, argp, sizeof(irq)))
39204838 goto out;
39214839 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
39224840 break;
....@@ -3934,7 +4852,7 @@
39344852 struct kvm_cpuid cpuid;
39354853
39364854 r = -EFAULT;
3937
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4855
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39384856 goto out;
39394857 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
39404858 break;
....@@ -3944,7 +4862,7 @@
39444862 struct kvm_cpuid2 cpuid;
39454863
39464864 r = -EFAULT;
3947
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4865
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39484866 goto out;
39494867 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
39504868 cpuid_arg->entries);
....@@ -3955,14 +4873,14 @@
39554873 struct kvm_cpuid2 cpuid;
39564874
39574875 r = -EFAULT;
3958
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4876
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39594877 goto out;
39604878 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
39614879 cpuid_arg->entries);
39624880 if (r)
39634881 goto out;
39644882 r = -EFAULT;
3965
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
4883
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
39664884 goto out;
39674885 r = 0;
39684886 break;
....@@ -3983,13 +4901,13 @@
39834901 struct kvm_tpr_access_ctl tac;
39844902
39854903 r = -EFAULT;
3986
- if (copy_from_user(&tac, argp, sizeof tac))
4904
+ if (copy_from_user(&tac, argp, sizeof(tac)))
39874905 goto out;
39884906 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
39894907 if (r)
39904908 goto out;
39914909 r = -EFAULT;
3992
- if (copy_to_user(argp, &tac, sizeof tac))
4910
+ if (copy_to_user(argp, &tac, sizeof(tac)))
39934911 goto out;
39944912 r = 0;
39954913 break;
....@@ -4002,7 +4920,7 @@
40024920 if (!lapic_in_kernel(vcpu))
40034921 goto out;
40044922 r = -EFAULT;
4005
- if (copy_from_user(&va, argp, sizeof va))
4923
+ if (copy_from_user(&va, argp, sizeof(va)))
40064924 goto out;
40074925 idx = srcu_read_lock(&vcpu->kvm->srcu);
40084926 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
....@@ -4013,7 +4931,7 @@
40134931 u64 mcg_cap;
40144932
40154933 r = -EFAULT;
4016
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
4934
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
40174935 goto out;
40184936 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
40194937 break;
....@@ -4022,7 +4940,7 @@
40224940 struct kvm_x86_mce mce;
40234941
40244942 r = -EFAULT;
4025
- if (copy_from_user(&mce, argp, sizeof mce))
4943
+ if (copy_from_user(&mce, argp, sizeof(mce)))
40264944 goto out;
40274945 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
40284946 break;
....@@ -4072,7 +4990,7 @@
40724990 break;
40734991 }
40744992 case KVM_GET_XSAVE: {
4075
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
4993
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
40764994 r = -ENOMEM;
40774995 if (!u.xsave)
40784996 break;
....@@ -4096,7 +5014,7 @@
40965014 break;
40975015 }
40985016 case KVM_GET_XCRS: {
4099
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
5017
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
41005018 r = -ENOMEM;
41015019 if (!u.xcrs)
41025020 break;
....@@ -4126,7 +5044,8 @@
41265044 r = -EINVAL;
41275045 user_tsc_khz = (u32)arg;
41285046
4129
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
5047
+ if (kvm_has_tsc_control &&
5048
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
41305049 goto out;
41315050
41325051 if (user_tsc_khz == 0)
....@@ -4159,7 +5078,7 @@
41595078 u32 user_data_size;
41605079
41615080 r = -EINVAL;
4162
- if (!kvm_x86_ops->get_nested_state)
5081
+ if (!kvm_x86_ops.nested_ops->get_state)
41635082 break;
41645083
41655084 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
....@@ -4167,8 +5086,8 @@
41675086 if (get_user(user_data_size, &user_kvm_nested_state->size))
41685087 break;
41695088
4170
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
4171
- user_data_size);
5089
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5090
+ user_data_size);
41725091 if (r < 0)
41735092 break;
41745093
....@@ -4189,7 +5108,7 @@
41895108 int idx;
41905109
41915110 r = -EINVAL;
4192
- if (!kvm_x86_ops->set_nested_state)
5111
+ if (!kvm_x86_ops.nested_ops->set_state)
41935112 break;
41945113
41955114 r = -EFAULT;
....@@ -4201,16 +5120,38 @@
42015120 break;
42025121
42035122 if (kvm_state.flags &
4204
- ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
5123
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
5124
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5125
+ | KVM_STATE_NESTED_GIF_SET))
42055126 break;
42065127
42075128 /* nested_run_pending implies guest_mode. */
4208
- if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
5129
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5130
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
42095131 break;
42105132
42115133 idx = srcu_read_lock(&vcpu->kvm->srcu);
4212
- r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
5134
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
42135135 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5136
+ break;
5137
+ }
5138
+ case KVM_GET_SUPPORTED_HV_CPUID: {
5139
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
5140
+ struct kvm_cpuid2 cpuid;
5141
+
5142
+ r = -EFAULT;
5143
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5144
+ goto out;
5145
+
5146
+ r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
5147
+ cpuid_arg->entries);
5148
+ if (r)
5149
+ goto out;
5150
+
5151
+ r = -EFAULT;
5152
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5153
+ goto out;
5154
+ r = 0;
42145155 break;
42155156 }
42165157 default:
....@@ -4234,14 +5175,14 @@
42345175
42355176 if (addr > (unsigned int)(-3 * PAGE_SIZE))
42365177 return -EINVAL;
4237
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
5178
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
42385179 return ret;
42395180 }
42405181
42415182 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
42425183 u64 ident_addr)
42435184 {
4244
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
5185
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
42455186 }
42465187
42475188 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
....@@ -4382,9 +5323,6 @@
43825323 {
43835324 struct kvm_pit *pit = kvm->arch.vpit;
43845325
4385
- if (!pit)
4386
- return -ENXIO;
4387
-
43885326 /* pit->pit_state.lock was overloaded to prevent userspace from getting
43895327 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
43905328 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
....@@ -4396,50 +5334,13 @@
43965334 return 0;
43975335 }
43985336
4399
-/**
4400
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
4401
- * @kvm: kvm instance
4402
- * @log: slot id and address to which we copy the log
4403
- *
4404
- * Steps 1-4 below provide general overview of dirty page logging. See
4405
- * kvm_get_dirty_log_protect() function description for additional details.
4406
- *
4407
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
4408
- * always flush the TLB (step 4) even if previous step failed and the dirty
4409
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
4410
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
4411
- * writes will be marked dirty for next log read.
4412
- *
4413
- * 1. Take a snapshot of the bit and clear it if needed.
4414
- * 2. Write protect the corresponding page.
4415
- * 3. Copy the snapshot to the userspace.
4416
- * 4. Flush TLB's if needed.
4417
- */
4418
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
5337
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
44195338 {
4420
- bool is_dirty = false;
4421
- int r;
4422
-
4423
- mutex_lock(&kvm->slots_lock);
4424
-
44255339 /*
44265340 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
44275341 */
4428
- if (kvm_x86_ops->flush_log_dirty)
4429
- kvm_x86_ops->flush_log_dirty(kvm);
4430
-
4431
- r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
4432
-
4433
- /*
4434
- * All the TLBs can be flushed out of mmu lock, see the comments in
4435
- * kvm_mmu_slot_remove_write_access().
4436
- */
4437
- lockdep_assert_held(&kvm->slots_lock);
4438
- if (is_dirty)
4439
- kvm_flush_remote_tlbs(kvm);
4440
-
4441
- mutex_unlock(&kvm->slots_lock);
4442
- return r;
5342
+ if (kvm_x86_ops.flush_log_dirty)
5343
+ kvm_x86_ops.flush_log_dirty(kvm);
44435344 }
44445345
44455346 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
....@@ -4454,8 +5355,8 @@
44545355 return 0;
44555356 }
44565357
4457
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4458
- struct kvm_enable_cap *cap)
5358
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5359
+ struct kvm_enable_cap *cap)
44595360 {
44605361 int r;
44615362
....@@ -4513,10 +5414,25 @@
45135414 kvm->arch.hlt_in_guest = true;
45145415 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
45155416 kvm->arch.pause_in_guest = true;
5417
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5418
+ kvm->arch.cstate_in_guest = true;
45165419 r = 0;
45175420 break;
45185421 case KVM_CAP_MSR_PLATFORM_INFO:
45195422 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5423
+ r = 0;
5424
+ break;
5425
+ case KVM_CAP_EXCEPTION_PAYLOAD:
5426
+ kvm->arch.exception_payload_enabled = cap->args[0];
5427
+ r = 0;
5428
+ break;
5429
+ case KVM_CAP_X86_USER_SPACE_MSR:
5430
+ r = -EINVAL;
5431
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
5432
+ KVM_MSR_EXIT_REASON_UNKNOWN |
5433
+ KVM_MSR_EXIT_REASON_FILTER))
5434
+ break;
5435
+ kvm->arch.user_space_msr_mask = cap->args[0];
45205436 r = 0;
45215437 break;
45225438 default:
....@@ -4525,6 +5441,180 @@
45255441 }
45265442 return r;
45275443 }
5444
+
5445
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5446
+{
5447
+ struct kvm_x86_msr_filter *msr_filter;
5448
+
5449
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5450
+ if (!msr_filter)
5451
+ return NULL;
5452
+
5453
+ msr_filter->default_allow = default_allow;
5454
+ return msr_filter;
5455
+}
5456
+
5457
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5458
+{
5459
+ u32 i;
5460
+
5461
+ if (!msr_filter)
5462
+ return;
5463
+
5464
+ for (i = 0; i < msr_filter->count; i++)
5465
+ kfree(msr_filter->ranges[i].bitmap);
5466
+
5467
+ kfree(msr_filter);
5468
+}
5469
+
5470
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5471
+ struct kvm_msr_filter_range *user_range)
5472
+{
5473
+ struct msr_bitmap_range range;
5474
+ unsigned long *bitmap = NULL;
5475
+ size_t bitmap_size;
5476
+ int r;
5477
+
5478
+ if (!user_range->nmsrs)
5479
+ return 0;
5480
+
5481
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5482
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5483
+ return -EINVAL;
5484
+
5485
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
5486
+ if (IS_ERR(bitmap))
5487
+ return PTR_ERR(bitmap);
5488
+
5489
+ range = (struct msr_bitmap_range) {
5490
+ .flags = user_range->flags,
5491
+ .base = user_range->base,
5492
+ .nmsrs = user_range->nmsrs,
5493
+ .bitmap = bitmap,
5494
+ };
5495
+
5496
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5497
+ r = -EINVAL;
5498
+ goto err;
5499
+ }
5500
+
5501
+ if (!range.flags) {
5502
+ r = -EINVAL;
5503
+ goto err;
5504
+ }
5505
+
5506
+ /* Everything ok, add this range identifier. */
5507
+ msr_filter->ranges[msr_filter->count] = range;
5508
+ msr_filter->count++;
5509
+
5510
+ return 0;
5511
+err:
5512
+ kfree(bitmap);
5513
+ return r;
5514
+}
5515
+
5516
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
5517
+ struct kvm_msr_filter *filter)
5518
+{
5519
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
5520
+ bool default_allow;
5521
+ bool empty = true;
5522
+ int r = 0;
5523
+ u32 i;
5524
+
5525
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
5526
+ return -EINVAL;
5527
+
5528
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
5529
+ empty &= !filter->ranges[i].nmsrs;
5530
+
5531
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
5532
+ if (empty && !default_allow)
5533
+ return -EINVAL;
5534
+
5535
+ new_filter = kvm_alloc_msr_filter(default_allow);
5536
+ if (!new_filter)
5537
+ return -ENOMEM;
5538
+
5539
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
5540
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
5541
+ if (r) {
5542
+ kvm_free_msr_filter(new_filter);
5543
+ return r;
5544
+ }
5545
+ }
5546
+
5547
+ mutex_lock(&kvm->lock);
5548
+
5549
+ /* The per-VM filter is protected by kvm->lock... */
5550
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
5551
+
5552
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
5553
+ synchronize_srcu(&kvm->srcu);
5554
+
5555
+ kvm_free_msr_filter(old_filter);
5556
+
5557
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5558
+ mutex_unlock(&kvm->lock);
5559
+
5560
+ return 0;
5561
+}
5562
+
5563
+#ifdef CONFIG_KVM_COMPAT
5564
+/* for KVM_X86_SET_MSR_FILTER */
5565
+struct kvm_msr_filter_range_compat {
5566
+ __u32 flags;
5567
+ __u32 nmsrs;
5568
+ __u32 base;
5569
+ __u32 bitmap;
5570
+};
5571
+
5572
+struct kvm_msr_filter_compat {
5573
+ __u32 flags;
5574
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
5575
+};
5576
+
5577
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
5578
+
5579
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5580
+ unsigned long arg)
5581
+{
5582
+ void __user *argp = (void __user *)arg;
5583
+ struct kvm *kvm = filp->private_data;
5584
+ long r = -ENOTTY;
5585
+
5586
+ switch (ioctl) {
5587
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
5588
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5589
+ struct kvm_msr_filter_compat filter_compat;
5590
+ struct kvm_msr_filter filter;
5591
+ int i;
5592
+
5593
+ if (copy_from_user(&filter_compat, user_msr_filter,
5594
+ sizeof(filter_compat)))
5595
+ return -EFAULT;
5596
+
5597
+ filter.flags = filter_compat.flags;
5598
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5599
+ struct kvm_msr_filter_range_compat *cr;
5600
+
5601
+ cr = &filter_compat.ranges[i];
5602
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
5603
+ .flags = cr->flags,
5604
+ .nmsrs = cr->nmsrs,
5605
+ .base = cr->base,
5606
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
5607
+ };
5608
+ }
5609
+
5610
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5611
+ break;
5612
+ }
5613
+ }
5614
+
5615
+ return r;
5616
+}
5617
+#endif
45285618
45295619 long kvm_arch_vm_ioctl(struct file *filp,
45305620 unsigned int ioctl, unsigned long arg)
....@@ -4555,7 +5645,7 @@
45555645 if (kvm->created_vcpus)
45565646 goto set_identity_unlock;
45575647 r = -EFAULT;
4558
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
5648
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
45595649 goto set_identity_unlock;
45605650 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
45615651 set_identity_unlock:
....@@ -4639,7 +5729,7 @@
46395729 if (r)
46405730 goto get_irqchip_out;
46415731 r = -EFAULT;
4642
- if (copy_to_user(argp, chip, sizeof *chip))
5732
+ if (copy_to_user(argp, chip, sizeof(*chip)))
46435733 goto get_irqchip_out;
46445734 r = 0;
46455735 get_irqchip_out:
....@@ -4660,9 +5750,6 @@
46605750 if (!irqchip_kernel(kvm))
46615751 goto set_irqchip_out;
46625752 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4663
- if (r)
4664
- goto set_irqchip_out;
4665
- r = 0;
46665753 set_irqchip_out:
46675754 kfree(chip);
46685755 break;
....@@ -4685,7 +5772,7 @@
46855772 }
46865773 case KVM_SET_PIT: {
46875774 r = -EFAULT;
4688
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
5775
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
46895776 goto out;
46905777 mutex_lock(&kvm->lock);
46915778 r = -ENXIO;
....@@ -4726,6 +5813,9 @@
47265813 struct kvm_reinject_control control;
47275814 r = -EFAULT;
47285815 if (copy_from_user(&control, argp, sizeof(control)))
5816
+ goto out;
5817
+ r = -ENXIO;
5818
+ if (!kvm->arch.vpit)
47295819 goto out;
47305820 r = kvm_vm_ioctl_reinject(kvm, &control);
47315821 break;
....@@ -4790,19 +5880,10 @@
47905880 r = 0;
47915881 break;
47925882 }
4793
- case KVM_ENABLE_CAP: {
4794
- struct kvm_enable_cap cap;
4795
-
4796
- r = -EFAULT;
4797
- if (copy_from_user(&cap, argp, sizeof(cap)))
4798
- goto out;
4799
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4800
- break;
4801
- }
48025883 case KVM_MEMORY_ENCRYPT_OP: {
48035884 r = -ENOTTY;
4804
- if (kvm_x86_ops->mem_enc_op)
4805
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
5885
+ if (kvm_x86_ops.mem_enc_op)
5886
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
48065887 break;
48075888 }
48085889 case KVM_MEMORY_ENCRYPT_REG_REGION: {
....@@ -4813,8 +5894,8 @@
48135894 goto out;
48145895
48155896 r = -ENOTTY;
4816
- if (kvm_x86_ops->mem_enc_reg_region)
4817
- r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
5897
+ if (kvm_x86_ops.mem_enc_reg_region)
5898
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
48185899 break;
48195900 }
48205901 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
....@@ -4825,8 +5906,8 @@
48255906 goto out;
48265907
48275908 r = -ENOTTY;
4828
- if (kvm_x86_ops->mem_enc_unreg_region)
4829
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
5909
+ if (kvm_x86_ops.mem_enc_unreg_region)
5910
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
48305911 break;
48315912 }
48325913 case KVM_HYPERV_EVENTFD: {
....@@ -4838,6 +5919,19 @@
48385919 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
48395920 break;
48405921 }
5922
+ case KVM_SET_PMU_EVENT_FILTER:
5923
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5924
+ break;
5925
+ case KVM_X86_SET_MSR_FILTER: {
5926
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5927
+ struct kvm_msr_filter filter;
5928
+
5929
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
5930
+ return -EFAULT;
5931
+
5932
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5933
+ break;
5934
+ }
48415935 default:
48425936 r = -ENOTTY;
48435937 }
....@@ -4847,58 +5941,96 @@
48475941
48485942 static void kvm_init_msr_list(void)
48495943 {
5944
+ struct x86_pmu_capability x86_pmu;
48505945 u32 dummy[2];
4851
- unsigned i, j;
5946
+ unsigned i;
48525947
4853
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4854
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
5948
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5949
+ "Please update the fixed PMCs in msrs_to_save_all[]");
5950
+
5951
+ perf_get_x86_pmu_capability(&x86_pmu);
5952
+
5953
+ num_msrs_to_save = 0;
5954
+ num_emulated_msrs = 0;
5955
+ num_msr_based_features = 0;
5956
+
5957
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5958
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
48555959 continue;
48565960
48575961 /*
48585962 * Even MSRs that are valid in the host may not be exposed
48595963 * to the guests in some cases.
48605964 */
4861
- switch (msrs_to_save[i]) {
5965
+ switch (msrs_to_save_all[i]) {
48625966 case MSR_IA32_BNDCFGS:
48635967 if (!kvm_mpx_supported())
48645968 continue;
48655969 break;
48665970 case MSR_TSC_AUX:
4867
- if (!kvm_x86_ops->rdtscp_supported())
5971
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5972
+ continue;
5973
+ break;
5974
+ case MSR_IA32_UMWAIT_CONTROL:
5975
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5976
+ continue;
5977
+ break;
5978
+ case MSR_IA32_RTIT_CTL:
5979
+ case MSR_IA32_RTIT_STATUS:
5980
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5981
+ continue;
5982
+ break;
5983
+ case MSR_IA32_RTIT_CR3_MATCH:
5984
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5985
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5986
+ continue;
5987
+ break;
5988
+ case MSR_IA32_RTIT_OUTPUT_BASE:
5989
+ case MSR_IA32_RTIT_OUTPUT_MASK:
5990
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5991
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5992
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5993
+ continue;
5994
+ break;
5995
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5996
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5997
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5998
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5999
+ continue;
6000
+ break;
6001
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
6002
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
6003
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
6004
+ continue;
6005
+ break;
6006
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
6007
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
6008
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
48686009 continue;
48696010 break;
48706011 default:
48716012 break;
48726013 }
48736014
4874
- if (j < i)
4875
- msrs_to_save[j] = msrs_to_save[i];
4876
- j++;
6015
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
48776016 }
4878
- num_msrs_to_save = j;
48796017
4880
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4881
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
6018
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
6019
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
48826020 continue;
48836021
4884
- if (j < i)
4885
- emulated_msrs[j] = emulated_msrs[i];
4886
- j++;
6022
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
48876023 }
4888
- num_emulated_msrs = j;
48896024
4890
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
6025
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
48916026 struct kvm_msr_entry msr;
48926027
4893
- msr.index = msr_based_features[i];
6028
+ msr.index = msr_based_features_all[i];
48946029 if (kvm_get_msr_feature(&msr))
48956030 continue;
48966031
4897
- if (j < i)
4898
- msr_based_features[j] = msr_based_features[i];
4899
- j++;
6032
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
49006033 }
4901
- num_msr_based_features = j;
49026034 }
49036035
49046036 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
....@@ -4947,13 +6079,13 @@
49476079 static void kvm_set_segment(struct kvm_vcpu *vcpu,
49486080 struct kvm_segment *var, int seg)
49496081 {
4950
- kvm_x86_ops->set_segment(vcpu, var, seg);
6082
+ kvm_x86_ops.set_segment(vcpu, var, seg);
49516083 }
49526084
49536085 void kvm_get_segment(struct kvm_vcpu *vcpu,
49546086 struct kvm_segment *var, int seg)
49556087 {
4956
- kvm_x86_ops->get_segment(vcpu, var, seg);
6088
+ kvm_x86_ops.get_segment(vcpu, var, seg);
49576089 }
49586090
49596091 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
....@@ -4965,7 +6097,7 @@
49656097
49666098 /* NPT walks are always user-walks */
49676099 access |= PFERR_USER_MASK;
4968
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
6100
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
49696101
49706102 return t_gpa;
49716103 }
....@@ -4973,14 +6105,14 @@
49736105 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
49746106 struct x86_exception *exception)
49756107 {
4976
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6108
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49776109 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49786110 }
49796111
49806112 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
49816113 struct x86_exception *exception)
49826114 {
4983
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6115
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49846116 access |= PFERR_FETCH_MASK;
49856117 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49866118 }
....@@ -4988,7 +6120,7 @@
49886120 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
49896121 struct x86_exception *exception)
49906122 {
4991
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6123
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49926124 access |= PFERR_WRITE_MASK;
49936125 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49946126 }
....@@ -5037,7 +6169,7 @@
50376169 struct x86_exception *exception)
50386170 {
50396171 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5040
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6172
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50416173 unsigned offset;
50426174 int ret;
50436175
....@@ -5062,7 +6194,7 @@
50626194 gva_t addr, void *val, unsigned int bytes,
50636195 struct x86_exception *exception)
50646196 {
5065
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6197
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50666198
50676199 /*
50686200 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
....@@ -5083,7 +6215,7 @@
50836215 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
50846216 u32 access = 0;
50856217
5086
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6218
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
50876219 access |= PFERR_USER_MASK;
50886220
50896221 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
....@@ -5136,7 +6268,7 @@
51366268 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
51376269 u32 access = PFERR_WRITE_MASK;
51386270
5139
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6271
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
51406272 access |= PFERR_USER_MASK;
51416273
51426274 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
....@@ -5149,13 +6281,6 @@
51496281 /* kvm_write_guest_virt_system can pull in tons of pages. */
51506282 vcpu->arch.l1tf_flush_l1d = true;
51516283
5152
- /*
5153
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5154
- * is returned, but our callers are not ready for that and they blindly
5155
- * call kvm_inject_page_fault. Ensure that they at least do not leak
5156
- * uninitialized kernel stack memory into cr2 and error code.
5157
- */
5158
- memset(exception, 0, sizeof(*exception));
51596284 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
51606285 PFERR_WRITE_MASK, exception);
51616286 }
....@@ -5163,25 +6288,23 @@
51636288
51646289 int handle_ud(struct kvm_vcpu *vcpu)
51656290 {
6291
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
51666292 int emul_type = EMULTYPE_TRAP_UD;
5167
- enum emulation_result er;
51686293 char sig[5]; /* ud2; .ascii "kvm" */
51696294 struct x86_exception e;
6295
+
6296
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6297
+ return 1;
51706298
51716299 if (force_emulation_prefix &&
51726300 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
51736301 sig, sizeof(sig), &e) == 0 &&
5174
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
6302
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
51756303 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
5176
- emul_type = 0;
6304
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
51776305 }
51786306
5179
- er = kvm_emulate_instruction(vcpu, emul_type);
5180
- if (er == EMULATE_USER_EXIT)
5181
- return 0;
5182
- if (er != EMULATE_DONE)
5183
- kvm_queue_exception(vcpu, UD_VECTOR);
5184
- return 1;
6307
+ return kvm_emulate_instruction(vcpu, emul_type);
51856308 }
51866309 EXPORT_SYMBOL_GPL(handle_ud);
51876310
....@@ -5204,7 +6327,7 @@
52046327 gpa_t *gpa, struct x86_exception *exception,
52056328 bool write)
52066329 {
5207
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6330
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
52086331 | (write ? PFERR_WRITE_MASK : 0);
52096332
52106333 /*
....@@ -5214,7 +6337,7 @@
52146337 */
52156338 if (vcpu_match_mmio_gva(vcpu, gva)
52166339 && !permission_fault(vcpu, vcpu->arch.walk_mmu,
5217
- vcpu->arch.access, 0, access)) {
6340
+ vcpu->arch.mmio_access, 0, access)) {
52186341 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
52196342 (gva & (PAGE_SIZE - 1));
52206343 trace_vcpu_match_mmio(gva, *gpa, write, false);
....@@ -5323,7 +6446,7 @@
53236446 int handled, ret;
53246447 bool write = ops->write;
53256448 struct kvm_mmio_fragment *frag;
5326
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6449
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
53276450
53286451 /*
53296452 * If the exit was due to a NPF we may already have a GPA.
....@@ -5332,10 +6455,9 @@
53326455 * operation using rep will only have the initial GPA from the NPF
53336456 * occurred.
53346457 */
5335
- if (vcpu->arch.gpa_available &&
5336
- emulator_can_use_gpa(ctxt) &&
5337
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5338
- gpa = vcpu->arch.gpa_val;
6458
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6459
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6460
+ gpa = ctxt->gpa_val;
53396461 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
53406462 } else {
53416463 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
....@@ -5456,9 +6578,10 @@
54566578 unsigned int bytes,
54576579 struct x86_exception *exception)
54586580 {
6581
+ struct kvm_host_map map;
54596582 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6583
+ u64 page_line_mask;
54606584 gpa_t gpa;
5461
- struct page *page;
54626585 char *kaddr;
54636586 bool exchanged;
54646587
....@@ -5472,15 +6595,23 @@
54726595 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
54736596 goto emul_write;
54746597
5475
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
6598
+ /*
6599
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
6600
+ * enabled in the host and the access splits a cache line.
6601
+ */
6602
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
6603
+ page_line_mask = ~(cache_line_size() - 1);
6604
+ else
6605
+ page_line_mask = PAGE_MASK;
6606
+
6607
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
54766608 goto emul_write;
54776609
5478
- page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
5479
- if (is_error_page(page))
6610
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
54806611 goto emul_write;
54816612
5482
- kaddr = kmap_atomic(page);
5483
- kaddr += offset_in_page(gpa);
6613
+ kaddr = map.hva + offset_in_page(gpa);
6614
+
54846615 switch (bytes) {
54856616 case 1:
54866617 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
....@@ -5497,13 +6628,12 @@
54976628 default:
54986629 BUG();
54996630 }
5500
- kunmap_atomic(kaddr);
5501
- kvm_release_page_dirty(page);
6631
+
6632
+ kvm_vcpu_unmap(vcpu, &map, true);
55026633
55036634 if (!exchanged)
55046635 return X86EMUL_CMPXCHG_FAILED;
55056636
5506
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
55076637 kvm_page_track_write(vcpu, gpa, new, bytes);
55086638
55096639 return X86EMUL_CONTINUE;
....@@ -5557,11 +6687,9 @@
55576687 return 0;
55586688 }
55596689
5560
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5561
- int size, unsigned short port, void *val,
5562
- unsigned int count)
6690
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6691
+ unsigned short port, void *val, unsigned int count)
55636692 {
5564
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
55656693 int ret;
55666694
55676695 if (vcpu->arch.pio.count)
....@@ -5581,20 +6709,33 @@
55816709 return 0;
55826710 }
55836711
5584
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
5585
- int size, unsigned short port,
5586
- const void *val, unsigned int count)
6712
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6713
+ int size, unsigned short port, void *val,
6714
+ unsigned int count)
55876715 {
5588
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6716
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
55896717
6718
+}
6719
+
6720
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6721
+ unsigned short port, const void *val,
6722
+ unsigned int count)
6723
+{
55906724 memcpy(vcpu->arch.pio_data, val, size * count);
55916725 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
55926726 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
55936727 }
55946728
6729
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6730
+ int size, unsigned short port,
6731
+ const void *val, unsigned int count)
6732
+{
6733
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6734
+}
6735
+
55956736 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
55966737 {
5597
- return kvm_x86_ops->get_segment_base(vcpu, seg);
6738
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
55986739 }
55996740
56006741 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
....@@ -5607,7 +6748,7 @@
56076748 if (!need_emulate_wbinvd(vcpu))
56086749 return X86EMUL_CONTINUE;
56096750
5610
- if (kvm_x86_ops->has_wbinvd_exit()) {
6751
+ if (kvm_x86_ops.has_wbinvd_exit()) {
56116752 int cpu = get_cpu();
56126753
56136754 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
....@@ -5712,27 +6853,27 @@
57126853
57136854 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
57146855 {
5715
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
6856
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
57166857 }
57176858
57186859 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57196860 {
5720
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
6861
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
57216862 }
57226863
57236864 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57246865 {
5725
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
6866
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
57266867 }
57276868
57286869 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57296870 {
5730
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
6871
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
57316872 }
57326873
57336874 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57346875 {
5735
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
6876
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
57366877 }
57376878
57386879 static unsigned long emulator_get_cached_segment_base(
....@@ -5810,28 +6951,33 @@
58106951 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
58116952 u32 msr_index, u64 *pdata)
58126953 {
5813
- struct msr_data msr;
6954
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
58146955 int r;
58156956
5816
- msr.index = msr_index;
5817
- msr.host_initiated = false;
5818
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
5819
- if (r)
5820
- return r;
6957
+ r = kvm_get_msr(vcpu, msr_index, pdata);
58216958
5822
- *pdata = msr.data;
5823
- return 0;
6959
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6960
+ /* Bounce to user space */
6961
+ return X86EMUL_IO_NEEDED;
6962
+ }
6963
+
6964
+ return r;
58246965 }
58256966
58266967 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
58276968 u32 msr_index, u64 data)
58286969 {
5829
- struct msr_data msr;
6970
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6971
+ int r;
58306972
5831
- msr.data = data;
5832
- msr.index = msr_index;
5833
- msr.host_initiated = false;
5834
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
6973
+ r = kvm_set_msr(vcpu, msr_index, data);
6974
+
6975
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6976
+ /* Bounce to user space */
6977
+ return X86EMUL_IO_NEEDED;
6978
+ }
6979
+
6980
+ return r;
58356981 }
58366982
58376983 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
....@@ -5851,7 +6997,7 @@
58516997 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
58526998 u32 pmc)
58536999 {
5854
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
7000
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
58557001 }
58567002
58577003 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
....@@ -5869,13 +7015,35 @@
58697015 struct x86_instruction_info *info,
58707016 enum x86_intercept_stage stage)
58717017 {
5872
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
7018
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
7019
+ &ctxt->exception);
58737020 }
58747021
58757022 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5876
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
7023
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
7024
+ bool exact_only)
58777025 {
5878
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
7026
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
7027
+}
7028
+
7029
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
7030
+{
7031
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
7032
+}
7033
+
7034
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
7035
+{
7036
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
7037
+}
7038
+
7039
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
7040
+{
7041
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
7042
+}
7043
+
7044
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
7045
+{
7046
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
58797047 }
58807048
58817049 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
....@@ -5890,7 +7058,7 @@
58907058
58917059 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
58927060 {
5893
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
7061
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
58947062 }
58957063
58967064 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
....@@ -5900,12 +7068,26 @@
59007068
59017069 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
59027070 {
5903
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
7071
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7072
+
7073
+ vcpu->arch.hflags = emul_flags;
7074
+ kvm_mmu_reset_context(vcpu);
59047075 }
59057076
5906
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
7077
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
7078
+ const char *smstate)
59077079 {
5908
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
7080
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
7081
+}
7082
+
7083
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
7084
+{
7085
+ kvm_smm_changed(emul_to_vcpu(ctxt));
7086
+}
7087
+
7088
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
7089
+{
7090
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
59097091 }
59107092
59117093 static const struct x86_emulate_ops emulate_ops = {
....@@ -5944,15 +7126,21 @@
59447126 .fix_hypercall = emulator_fix_hypercall,
59457127 .intercept = emulator_intercept,
59467128 .get_cpuid = emulator_get_cpuid,
7129
+ .guest_has_long_mode = emulator_guest_has_long_mode,
7130
+ .guest_has_movbe = emulator_guest_has_movbe,
7131
+ .guest_has_fxsr = emulator_guest_has_fxsr,
7132
+ .guest_has_rdpid = emulator_guest_has_rdpid,
59477133 .set_nmi_mask = emulator_set_nmi_mask,
59487134 .get_hflags = emulator_get_hflags,
59497135 .set_hflags = emulator_set_hflags,
59507136 .pre_leave_smm = emulator_pre_leave_smm,
7137
+ .post_leave_smm = emulator_post_leave_smm,
7138
+ .set_xcr = emulator_set_xcr,
59517139 };
59527140
59537141 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
59547142 {
5955
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
7143
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
59567144 /*
59577145 * an sti; sti; sequence only disable interrupts for the first
59587146 * instruction. So, if the last instruction, be it emulated or
....@@ -5963,7 +7151,7 @@
59637151 if (int_shadow & mask)
59647152 mask = 0;
59657153 if (unlikely(int_shadow || mask)) {
5966
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
7154
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
59677155 if (!mask)
59687156 kvm_make_request(KVM_REQ_EVENT, vcpu);
59697157 }
....@@ -5971,9 +7159,9 @@
59717159
59727160 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
59737161 {
5974
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7162
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59757163 if (ctxt->exception.vector == PF_VECTOR)
5976
- return kvm_propagate_fault(vcpu, &ctxt->exception);
7164
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
59777165
59787166 if (ctxt->exception.error_code_valid)
59797167 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
....@@ -5983,13 +7171,31 @@
59837171 return false;
59847172 }
59857173
7174
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
7175
+{
7176
+ struct x86_emulate_ctxt *ctxt;
7177
+
7178
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
7179
+ if (!ctxt) {
7180
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
7181
+ return NULL;
7182
+ }
7183
+
7184
+ ctxt->vcpu = vcpu;
7185
+ ctxt->ops = &emulate_ops;
7186
+ vcpu->arch.emulate_ctxt = ctxt;
7187
+
7188
+ return ctxt;
7189
+}
7190
+
59867191 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
59877192 {
5988
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7193
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59897194 int cs_db, cs_l;
59907195
5991
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
7196
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
59927197
7198
+ ctxt->gpa_available = false;
59937199 ctxt->eflags = kvm_get_rflags(vcpu);
59947200 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
59957201
....@@ -6003,13 +7209,18 @@
60037209 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
60047210 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
60057211
7212
+ ctxt->interruptibility = 0;
7213
+ ctxt->have_exception = false;
7214
+ ctxt->exception.vector = -1;
7215
+ ctxt->perm_ok = false;
7216
+
60067217 init_decode_cache(ctxt);
60077218 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
60087219 }
60097220
6010
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
7221
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
60117222 {
6012
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7223
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
60137224 int ret;
60147225
60157226 init_emulate_ctxt(vcpu);
....@@ -6019,37 +7230,43 @@
60197230 ctxt->_eip = ctxt->eip + inc_eip;
60207231 ret = emulate_int_real(ctxt, irq);
60217232
6022
- if (ret != X86EMUL_CONTINUE)
6023
- return EMULATE_FAIL;
6024
-
6025
- ctxt->eip = ctxt->_eip;
6026
- kvm_rip_write(vcpu, ctxt->eip);
6027
- kvm_set_rflags(vcpu, ctxt->eflags);
6028
-
6029
- return EMULATE_DONE;
7233
+ if (ret != X86EMUL_CONTINUE) {
7234
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7235
+ } else {
7236
+ ctxt->eip = ctxt->_eip;
7237
+ kvm_rip_write(vcpu, ctxt->eip);
7238
+ kvm_set_rflags(vcpu, ctxt->eflags);
7239
+ }
60307240 }
60317241 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
60327242
60337243 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
60347244 {
6035
- int r = EMULATE_DONE;
6036
-
60377245 ++vcpu->stat.insn_emulation_fail;
60387246 trace_kvm_emulate_insn_failed(vcpu);
60397247
6040
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
6041
- return EMULATE_FAIL;
7248
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
7249
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7250
+ return 1;
7251
+ }
60427252
6043
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
7253
+ if (emulation_type & EMULTYPE_SKIP) {
60447254 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
60457255 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
60467256 vcpu->run->internal.ndata = 0;
6047
- r = EMULATE_USER_EXIT;
7257
+ return 0;
60487258 }
60497259
60507260 kvm_queue_exception(vcpu, UD_VECTOR);
60517261
6052
- return r;
7262
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7263
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7264
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7265
+ vcpu->run->internal.ndata = 0;
7266
+ return 0;
7267
+ }
7268
+
7269
+ return 1;
60537270 }
60547271
60557272 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
....@@ -6059,13 +7276,14 @@
60597276 gpa_t gpa = cr2_or_gpa;
60607277 kvm_pfn_t pfn;
60617278
6062
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7279
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
60637280 return false;
60647281
6065
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7282
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7283
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
60667284 return false;
60677285
6068
- if (!vcpu->arch.mmu.direct_map) {
7286
+ if (!vcpu->arch.mmu->direct_map) {
60697287 /*
60707288 * Write permission should be allowed since only
60717289 * write access need to be emulated.
....@@ -6098,7 +7316,7 @@
60987316 kvm_release_pfn_clean(pfn);
60997317
61007318 /* The instructions are well-emulated on direct mmu. */
6101
- if (vcpu->arch.mmu.direct_map) {
7319
+ if (vcpu->arch.mmu->direct_map) {
61027320 unsigned int indirect_shadow_pages;
61037321
61047322 spin_lock(&vcpu->kvm->mmu_lock);
....@@ -6150,10 +7368,11 @@
61507368 */
61517369 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
61527370
6153
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7371
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
61547372 return false;
61557373
6156
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7374
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7375
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
61577376 return false;
61587377
61597378 if (x86_page_table_writing_insn(ctxt))
....@@ -6165,7 +7384,7 @@
61657384 vcpu->arch.last_retry_eip = ctxt->eip;
61667385 vcpu->arch.last_retry_addr = cr2_or_gpa;
61677386
6168
- if (!vcpu->arch.mmu.direct_map)
7387
+ if (!vcpu->arch.mmu->direct_map)
61697388 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
61707389
61717390 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
....@@ -6189,16 +7408,6 @@
61897408 kvm_mmu_reset_context(vcpu);
61907409 }
61917410
6192
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
6193
-{
6194
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
6195
-
6196
- vcpu->arch.hflags = emul_flags;
6197
-
6198
- if (changed & HF_SMM_MASK)
6199
- kvm_smm_changed(vcpu);
6200
-}
6201
-
62027411 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
62037412 unsigned long *db)
62047413 {
....@@ -6214,34 +7423,29 @@
62147423 return dr6;
62157424 }
62167425
6217
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
7426
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
62187427 {
62197428 struct kvm_run *kvm_run = vcpu->run;
62207429
62217430 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
62227431 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
6223
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
7432
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
62247433 kvm_run->debug.arch.exception = DB_VECTOR;
62257434 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6226
- *r = EMULATE_USER_EXIT;
6227
- } else {
6228
- /*
6229
- * "Certain debug exceptions may clear bit 0-3. The
6230
- * remaining contents of the DR6 register are never
6231
- * cleared by the processor".
6232
- */
6233
- vcpu->arch.dr6 &= ~15;
6234
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
6235
- kvm_queue_exception(vcpu, DB_VECTOR);
7435
+ return 0;
62367436 }
7437
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7438
+ return 1;
62377439 }
62387440
62397441 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
62407442 {
6241
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
6242
- int r = EMULATE_DONE;
7443
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7444
+ int r;
62437445
6244
- kvm_x86_ops->skip_emulated_instruction(vcpu);
7446
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7447
+ if (unlikely(!r))
7448
+ return 0;
62457449
62467450 /*
62477451 * rflags is the old, "raw" value of the flags. The new value has
....@@ -6252,12 +7456,12 @@
62527456 * that sets the TF flag".
62537457 */
62547458 if (unlikely(rflags & X86_EFLAGS_TF))
6255
- kvm_vcpu_do_singlestep(vcpu, &r);
6256
- return r == EMULATE_DONE;
7459
+ r = kvm_vcpu_do_singlestep(vcpu);
7460
+ return r;
62577461 }
62587462 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
62597463
6260
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
7464
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
62617465 {
62627466 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
62637467 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
....@@ -6272,7 +7476,7 @@
62727476 kvm_run->debug.arch.pc = eip;
62737477 kvm_run->debug.arch.exception = DB_VECTOR;
62747478 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6275
- *r = EMULATE_USER_EXIT;
7479
+ *r = 0;
62767480 return true;
62777481 }
62787482 }
....@@ -6285,10 +7489,8 @@
62857489 vcpu->arch.db);
62867490
62877491 if (dr6 != 0) {
6288
- vcpu->arch.dr6 &= ~15;
6289
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
6290
- kvm_queue_exception(vcpu, DB_VECTOR);
6291
- *r = EMULATE_DONE;
7492
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7493
+ *r = 1;
62927494 return true;
62937495 }
62947496 }
....@@ -6327,13 +7529,45 @@
63277529 return false;
63287530 }
63297531
7532
+/*
7533
+ * Decode an instruction for emulation. The caller is responsible for handling
7534
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
7535
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
7536
+ * code breakpoints have higher priority and thus have already been done by
7537
+ * hardware.
7538
+ *
7539
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
7540
+ * response to a machine check.
7541
+ */
7542
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
7543
+ void *insn, int insn_len)
7544
+{
7545
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7546
+ int r;
7547
+
7548
+ init_emulate_ctxt(vcpu);
7549
+
7550
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7551
+
7552
+ r = x86_decode_insn(ctxt, insn, insn_len);
7553
+
7554
+ trace_kvm_emulate_insn_start(vcpu);
7555
+ ++vcpu->stat.insn_emulation;
7556
+
7557
+ return r;
7558
+}
7559
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
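Splitting the decode step out also makes the EMULTYPE contract easier to see: when decode fails and EMULTYPE_TRAP_UD was set, x86_emulate_instruction() below queues a #UD for the guest and returns 1. A hedged sketch of the typical caller (the handler name is illustrative, not from this patch):

static int handle_ud_example(struct kvm_vcpu *vcpu)
{
	/* decode and emulate; an unrecognised opcode becomes a guest #UD */
	return kvm_emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
}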
7560
+
63307561 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
63317562 int emulation_type, void *insn, int insn_len)
63327563 {
63337564 int r;
6334
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7565
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63357566 bool writeback = true;
6336
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7567
+ bool write_fault_to_spt;
7568
+
7569
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7570
+ return 1;
63377571
63387572 vcpu->arch.l1tf_flush_l1d = true;
63397573
....@@ -6341,40 +7575,36 @@
63417575 * Clear write_fault_to_shadow_pgtable here to ensure it is
63427576 * never reused.
63437577 */
7578
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
63447579 vcpu->arch.write_fault_to_shadow_pgtable = false;
6345
- kvm_clear_exception_queue(vcpu);
63467580
63477581 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
6348
- init_emulate_ctxt(vcpu);
7582
+ kvm_clear_exception_queue(vcpu);
63497583
63507584 /*
6351
- * We will reenter on the same instruction since
6352
- * we do not set complete_userspace_io. This does not
6353
- * handle watchpoints yet, those would be handled in
6354
- * the emulate_ops.
7585
+ * Return immediately if RIP hits a code breakpoint, such #DBs
7586
+ * are fault-like and are higher priority than any faults on
7587
+ * the code fetch itself.
63557588 */
63567589 if (!(emulation_type & EMULTYPE_SKIP) &&
6357
- kvm_vcpu_check_breakpoint(vcpu, &r))
7590
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
63587591 return r;
63597592
6360
- ctxt->interruptibility = 0;
6361
- ctxt->have_exception = false;
6362
- ctxt->exception.vector = -1;
6363
- ctxt->perm_ok = false;
6364
-
6365
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
6366
-
6367
- r = x86_decode_insn(ctxt, insn, insn_len);
6368
-
6369
- trace_kvm_emulate_insn_start(vcpu);
6370
- ++vcpu->stat.insn_emulation;
7593
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
7594
+ insn, insn_len);
63717595 if (r != EMULATION_OK) {
6372
- if (emulation_type & EMULTYPE_TRAP_UD)
6373
- return EMULATE_FAIL;
6374
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
6375
- emulation_type))
6376
- return EMULATE_DONE;
6377
- if (ctxt->have_exception) {
7596
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
7597
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7598
+ kvm_queue_exception(vcpu, UD_VECTOR);
7599
+ return 1;
7600
+ }
7601
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
7602
+ write_fault_to_spt,
7603
+ emulation_type))
7604
+ return 1;
7605
+
7606
+ if (ctxt->have_exception &&
7607
+ !(emulation_type & EMULTYPE_SKIP)) {
63787608 /*
63797609 * #UD should result in just EMULATION_FAILED, and trap-like
63807610 * exception should not be encountered during decode.
....@@ -6382,27 +7612,32 @@
63827612 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
63837613 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
63847614 inject_emulated_exception(vcpu);
6385
- return EMULATE_DONE;
7615
+ return 1;
63867616 }
6387
- if (emulation_type & EMULTYPE_SKIP)
6388
- return EMULATE_FAIL;
63897617 return handle_emulation_failure(vcpu, emulation_type);
63907618 }
63917619 }
63927620
6393
- if ((emulation_type & EMULTYPE_VMWARE) &&
6394
- !is_vmware_backdoor_opcode(ctxt))
6395
- return EMULATE_FAIL;
7621
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7622
+ !is_vmware_backdoor_opcode(ctxt)) {
7623
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7624
+ return 1;
7625
+ }
63967626
7627
+ /*
7628
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7629
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
7630
+ * updating interruptibility state and injecting single-step #DBs.
7631
+ */
63977632 if (emulation_type & EMULTYPE_SKIP) {
63987633 kvm_rip_write(vcpu, ctxt->_eip);
63997634 if (ctxt->eflags & X86_EFLAGS_RF)
64007635 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
6401
- return EMULATE_DONE;
7636
+ return 1;
64027637 }
64037638
64047639 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
6405
- return EMULATE_DONE;
7640
+ return 1;
64067641
64077642 /* this is needed for vmware backdoor interface to work since it
64087643    changes register values during IO operation */
....@@ -6412,24 +7647,35 @@
64127647 }
64137648
64147649 restart:
6415
- /* Save the faulting GPA (cr2) in the address field */
6416
- ctxt->exception.address = cr2_or_gpa;
7650
+ if (emulation_type & EMULTYPE_PF) {
7651
+ /* Save the faulting GPA (cr2) in the address field */
7652
+ ctxt->exception.address = cr2_or_gpa;
7653
+
7654
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
7655
+ if (vcpu->arch.mmu->direct_map) {
7656
+ ctxt->gpa_available = true;
7657
+ ctxt->gpa_val = cr2_or_gpa;
7658
+ }
7659
+ } else {
7660
+ /* Sanitize the address out of an abundance of paranoia. */
7661
+ ctxt->exception.address = 0;
7662
+ }
64177663
64187664 r = x86_emulate_insn(ctxt);
64197665
64207666 if (r == EMULATION_INTERCEPTED)
6421
- return EMULATE_DONE;
7667
+ return 1;
64227668
64237669 if (r == EMULATION_FAILED) {
64247670 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
64257671 emulation_type))
6426
- return EMULATE_DONE;
7672
+ return 1;
64277673
64287674 return handle_emulation_failure(vcpu, emulation_type);
64297675 }
64307676
64317677 if (ctxt->have_exception) {
6432
- r = EMULATE_DONE;
7678
+ r = 1;
64337679 if (inject_emulated_exception(vcpu))
64347680 return r;
64357681 } else if (vcpu->arch.pio.count) {
....@@ -6440,26 +7686,36 @@
64407686 writeback = false;
64417687 vcpu->arch.complete_userspace_io = complete_emulated_pio;
64427688 }
6443
- r = EMULATE_USER_EXIT;
7689
+ r = 0;
64447690 } else if (vcpu->mmio_needed) {
7691
+ ++vcpu->stat.mmio_exits;
7692
+
64457693 if (!vcpu->mmio_is_write)
64467694 writeback = false;
6447
- r = EMULATE_USER_EXIT;
7695
+ r = 0;
64487696 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
64497697 } else if (r == EMULATION_RESTART)
64507698 goto restart;
64517699 else
6452
- r = EMULATE_DONE;
7700
+ r = 1;
64537701
64547702 if (writeback) {
6455
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
7703
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
64567704 toggle_interruptibility(vcpu, ctxt->interruptibility);
64577705 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7706
+
7707
+ /*
7708
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
7709
+ * only supports code breakpoints and general detect #DB, both
7710
+ * of which are fault-like.
7711
+ */
64587712 if (!ctxt->have_exception ||
64597713 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
64607714 kvm_rip_write(vcpu, ctxt->eip);
6461
- if (r == EMULATE_DONE && ctxt->tf)
6462
- kvm_vcpu_do_singlestep(vcpu, &r);
7715
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7716
+ r = kvm_vcpu_do_singlestep(vcpu);
7717
+ if (kvm_x86_ops.update_emulated_instruction)
7718
+ kvm_x86_ops.update_emulated_instruction(vcpu);
64637719 __kvm_set_rflags(vcpu, ctxt->eflags);
64647720 }
64657721
....@@ -6509,9 +7765,9 @@
65097765 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
65107766 unsigned short port)
65117767 {
6512
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
6513
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
6514
- size, port, &val, 1);
7768
+ unsigned long val = kvm_rax_read(vcpu);
7769
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7770
+
65157771 if (ret)
65167772 return ret;
65177773
....@@ -6544,16 +7800,14 @@
65447800 }
65457801
65467802 /* For size less than 4 we merge, else we zero extend */
6547
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
6548
- : 0;
7803
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
65497804
65507805 /*
6551
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
7806
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
65527807 * the copy and tracing
65537808 */
6554
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
6555
- vcpu->arch.pio.port, &val, 1);
6556
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7809
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7810
+ kvm_rax_write(vcpu, val);
65577811
65587812 return kvm_skip_emulated_instruction(vcpu);
65597813 }
....@@ -6565,12 +7819,11 @@
65657819 int ret;
65667820
65677821 /* For size less than 4 we merge, else we zero extend */
6568
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
7822
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
65697823
6570
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
6571
- &val, 1);
7824
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
65727825 if (ret) {
6573
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7826
+ kvm_rax_write(vcpu, val);
65747827 return ret;
65757828 }
65767829
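The "merge vs. zero extend" rule above is easiest to see with concrete values. A standalone sketch of just the arithmetic (not KVM code; it assumes a little-endian host, as on x86):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t rax = 0x1122334455667788ULL;
	uint16_t in2 = 0xabcd;		/* result of a 2-byte IN */
	uint32_t in4 = 0xdeadbeef;	/* result of a 4-byte IN */
	uint64_t val;

	/* size < 4: start from the old RAX so the upper bytes survive */
	val = rax;
	memcpy(&val, &in2, sizeof(in2));
	assert(val == 0x112233445566abcdULL);

	/* size == 4: start from 0 so the 32-bit result is zero-extended */
	val = 0;
	memcpy(&val, &in4, sizeof(in4));
	assert(val == 0x00000000deadbeefULL);

	return 0;
}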
....@@ -6649,10 +7902,8 @@
66497902 }
66507903 #endif
66517904
6652
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
6653
- void *data)
7905
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
66547906 {
6655
- struct cpufreq_freqs *freq = data;
66567907 struct kvm *kvm;
66577908 struct kvm_vcpu *vcpu;
66587909 int i, send_ipi = 0;
....@@ -6696,17 +7947,12 @@
66967947 *
66977948 */
66987949
6699
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
6700
- return 0;
6701
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
6702
- return 0;
6703
-
6704
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7950
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67057951
67067952 mutex_lock(&kvm_lock);
67077953 list_for_each_entry(kvm, &vm_list, vm_list) {
67087954 kvm_for_each_vcpu(i, vcpu, kvm) {
6709
- if (vcpu->cpu != freq->cpu)
7955
+ if (vcpu->cpu != cpu)
67107956 continue;
67117957 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
67127958 if (vcpu->cpu != raw_smp_processor_id())
....@@ -6728,8 +7974,24 @@
67287974 * guest context is entered kvmclock will be updated,
67297975 * so the guest will not see stale values.
67307976 */
6731
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7977
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67327978 }
7979
+}
7980
+
7981
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7982
+ void *data)
7983
+{
7984
+ struct cpufreq_freqs *freq = data;
7985
+ int cpu;
7986
+
7987
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7988
+ return 0;
7989
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7990
+ return 0;
7991
+
7992
+ for_each_cpu(cpu, freq->policy->cpus)
7993
+ __kvmclock_cpufreq_notifier(freq, cpu);
7994
+
67337995 return 0;
67347996 }
67357997
....@@ -6749,20 +8011,21 @@
67498011
67508012 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
67518013 #ifdef CONFIG_CPU_FREQ
6752
- struct cpufreq_policy policy;
8014
+ struct cpufreq_policy *policy;
67538015 int cpu;
67548016
6755
- memset(&policy, 0, sizeof(policy));
67568017 cpu = get_cpu();
6757
- cpufreq_get_policy(&policy, cpu);
6758
- if (policy.cpuinfo.max_freq)
6759
- max_tsc_khz = policy.cpuinfo.max_freq;
8018
+ policy = cpufreq_cpu_get(cpu);
8019
+ if (policy) {
8020
+ if (policy->cpuinfo.max_freq)
8021
+ max_tsc_khz = policy->cpuinfo.max_freq;
8022
+ cpufreq_cpu_put(policy);
8023
+ }
67608024 put_cpu();
67618025 #endif
67628026 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
67638027 CPUFREQ_TRANSITION_NOTIFIER);
67648028 }
6765
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
67668029
67678030 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
67688031 kvmclock_cpu_online, kvmclock_cpu_down_prep);
....@@ -6781,7 +8044,7 @@
67818044 int user_mode = 3;
67828045
67838046 if (__this_cpu_read(current_vcpu))
6784
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
8047
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
67858048
67868049 return user_mode != 0;
67878050 }
....@@ -6796,10 +8059,20 @@
67968059 return ip;
67978060 }
67988061
8062
+static void kvm_handle_intel_pt_intr(void)
8063
+{
8064
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
8065
+
8066
+ kvm_make_request(KVM_REQ_PMI, vcpu);
8067
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8068
+ (unsigned long *)&vcpu->arch.pmu.global_status);
8069
+}
8070
+
67998071 static struct perf_guest_info_callbacks kvm_guest_cbs = {
68008072 .is_in_guest = kvm_is_in_guest,
68018073 .is_user_mode = kvm_is_user_mode,
68028074 .get_guest_ip = kvm_get_guest_ip,
8075
+ .handle_intel_pt_intr = NULL,
68038076 };
68048077
68058078 #ifdef CONFIG_X86_64
....@@ -6821,6 +8094,18 @@
68218094 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
68228095
68238096 /*
8097
+ * Indirection to move queue_work() out of the tk_core.seq write held
8098
+ * region to prevent possible deadlocks against time accessors which
8099
+ * are invoked with work related locks held.
8100
+ */
8101
+static void pvclock_irq_work_fn(struct irq_work *w)
8102
+{
8103
+ queue_work(system_long_wq, &pvclock_gtod_work);
8104
+}
8105
+
8106
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8107
+
8108
+/*
68248109 * Notification about pvclock gtod data update.
68258110 */
68268111 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
....@@ -6831,13 +8116,14 @@
68318116
68328117 update_pvclock_gtod(tk);
68338118
6834
- /* disable master clock if host does not trust, or does not
6835
- * use, TSC based clocksource.
8119
+ /*
8120
+ * Disable master clock if host does not trust, or does not use,
8121
+ * TSC based clocksource. Delegate queue_work() to irq_work as
8122
+ * this is invoked with tk_core.seq write held.
68368123 */
68378124 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
68388125 atomic_read(&kvm_guest_has_master_clock) != 0)
6839
- queue_work(system_long_wq, &pvclock_gtod_work);
6840
-
8126
+ irq_work_queue(&pvclock_irq_work);
68418127 return 0;
68428128 }
68438129
....@@ -6848,50 +8134,79 @@
68488134
68498135 int kvm_arch_init(void *opaque)
68508136 {
8137
+ struct kvm_x86_init_ops *ops = opaque;
68518138 int r;
6852
- struct kvm_x86_ops *ops = opaque;
68538139
6854
- if (kvm_x86_ops) {
8140
+ if (kvm_x86_ops.hardware_enable) {
68558141 printk(KERN_ERR "kvm: already loaded the other module\n");
68568142 r = -EEXIST;
68578143 goto out;
68588144 }
68598145
68608146 if (!ops->cpu_has_kvm_support()) {
6861
- printk(KERN_ERR "kvm: no hardware support\n");
8147
+ pr_err_ratelimited("kvm: no hardware support\n");
68628148 r = -EOPNOTSUPP;
68638149 goto out;
68648150 }
68658151 if (ops->disabled_by_bios()) {
6866
- printk(KERN_ERR "kvm: disabled by bios\n");
8152
+ pr_err_ratelimited("kvm: disabled by bios\n");
8153
+ r = -EOPNOTSUPP;
8154
+ goto out;
8155
+ }
8156
+
8157
+ /*
8158
+ * KVM explicitly assumes that the guest has an FPU and
8159
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
8160
+	 * vCPU's FPU state to a fxregs_state struct.
8161
+ */
8162
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8163
+ printk(KERN_ERR "kvm: inadequate fpu\n");
68678164 r = -EOPNOTSUPP;
68688165 goto out;
68698166 }
68708167
68718168 r = -ENOMEM;
6872
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
6873
- if (!shared_msrs) {
6874
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
8169
+ x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
8170
+ __alignof__(struct fpu), SLAB_ACCOUNT,
8171
+ NULL);
8172
+ if (!x86_fpu_cache) {
8173
+ printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
68758174 goto out;
68768175 }
68778176
6878
- r = kvm_mmu_module_init();
8177
+ x86_emulator_cache = kvm_alloc_emulator_cache();
8178
+ if (!x86_emulator_cache) {
8179
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
8180
+ goto out_free_x86_fpu_cache;
8181
+ }
8182
+
8183
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
8184
+ if (!user_return_msrs) {
8185
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
8186
+ goto out_free_x86_emulator_cache;
8187
+ }
8188
+
8189
+ r = kvm_mmu_vendor_module_init();
68798190 if (r)
68808191 goto out_free_percpu;
6881
-
6882
- kvm_x86_ops = ops;
68838192
68848193 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
68858194 PT_DIRTY_MASK, PT64_NX_MASK, 0,
68868195 PT_PRESENT_MASK, 0, sme_me_mask);
68878196 kvm_timer_init();
68888197
8198
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
8199
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
68898200 perf_register_guest_info_callbacks(&kvm_guest_cbs);
68908201
6891
- if (boot_cpu_has(X86_FEATURE_XSAVE))
8202
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
68928203 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
8204
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
8205
+ }
68938206
68948207 kvm_lapic_init();
8208
+ if (pi_inject_timer == -1)
8209
+ pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
68958210 #ifdef CONFIG_X86_64
68968211 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
68978212
....@@ -6902,7 +8217,11 @@
69028217 return 0;
69038218
69048219 out_free_percpu:
6905
- free_percpu(shared_msrs);
8220
+ free_percpu(user_return_msrs);
8221
+out_free_x86_emulator_cache:
8222
+ kmem_cache_destroy(x86_emulator_cache);
8223
+out_free_x86_fpu_cache:
8224
+ kmem_cache_destroy(x86_fpu_cache);
69068225 out:
69078226 return r;
69088227 }
....@@ -6915,6 +8234,7 @@
69158234 #endif
69168235 kvm_lapic_exit();
69178236 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
8237
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
69188238
69198239 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
69208240 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
....@@ -6922,11 +8242,14 @@
69228242 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
69238243 #ifdef CONFIG_X86_64
69248244 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8245
+ irq_work_sync(&pvclock_irq_work);
69258246 cancel_work_sync(&pvclock_gtod_work);
69268247 #endif
6927
- kvm_x86_ops = NULL;
6928
- kvm_mmu_module_exit();
6929
- free_percpu(shared_msrs);
8248
+ kvm_x86_ops.hardware_enable = NULL;
8249
+ kvm_mmu_vendor_module_exit();
8250
+ free_percpu(user_return_msrs);
8251
+ kmem_cache_destroy(x86_emulator_cache);
8252
+ kmem_cache_destroy(x86_fpu_cache);
69308253 }
69318254
69328255 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
....@@ -6990,22 +8313,52 @@
69908313 */
69918314 static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
69928315 {
6993
- struct kvm_lapic_irq lapic_irq;
8316
+ /*
8317
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
8318
+ * common code, e.g. for tracing. Defer initialization to the compiler.
8319
+ */
8320
+ struct kvm_lapic_irq lapic_irq = {
8321
+ .delivery_mode = APIC_DM_REMRD,
8322
+ .dest_mode = APIC_DEST_PHYSICAL,
8323
+ .shorthand = APIC_DEST_NOSHORT,
8324
+ .dest_id = apicid,
8325
+ };
69948326
6995
- lapic_irq.shorthand = 0;
6996
- lapic_irq.dest_mode = 0;
6997
- lapic_irq.level = 0;
6998
- lapic_irq.dest_id = apicid;
6999
- lapic_irq.msi_redir_hint = false;
7000
-
7001
- lapic_irq.delivery_mode = APIC_DM_REMRD;
70028327 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
70038328 }
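"Defer initialization to the compiler" works because a designated initializer zero-initializes every member that is not named explicitly. A tiny standalone illustration (the struct is a stand-in, not the real kvm_lapic_irq):

#include <assert.h>

struct toy_irq {
	int delivery_mode;
	int dest_mode;
	int dest_id;
	int level;		/* not named below */
	int msi_redir_hint;	/* not named below */
};

int main(void)
{
	struct toy_irq irq = {
		.delivery_mode = 3,
		.dest_id = 7,
	};

	/* C99 6.7.8: members without a designator are initialized to zero */
	assert(irq.level == 0 && irq.msi_redir_hint == 0);
	return 0;
}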
70048329
7005
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
8330
+bool kvm_apicv_activated(struct kvm *kvm)
70068331 {
7007
- vcpu->arch.apicv_active = false;
7008
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
8332
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8333
+}
8334
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8335
+
8336
+void kvm_apicv_init(struct kvm *kvm, bool enable)
8337
+{
8338
+ if (enable)
8339
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
8340
+ &kvm->arch.apicv_inhibit_reasons);
8341
+ else
8342
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
8343
+ &kvm->arch.apicv_inhibit_reasons);
8344
+}
8345
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
8346
+
8347
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8348
+{
8349
+ struct kvm_vcpu *target = NULL;
8350
+ struct kvm_apic_map *map;
8351
+
8352
+ rcu_read_lock();
8353
+ map = rcu_dereference(kvm->arch.apic_map);
8354
+
8355
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8356
+ target = map->phys_map[dest_id]->vcpu;
8357
+
8358
+ rcu_read_unlock();
8359
+
8360
+ if (target && READ_ONCE(target->ready))
8361
+ kvm_vcpu_yield_to(target);
70098362 }
70108363
70118364 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
....@@ -7016,11 +8369,11 @@
70168369 if (kvm_hv_hypercall_enabled(vcpu->kvm))
70178370 return kvm_hv_hypercall(vcpu);
70188371
7019
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
7020
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
7021
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
7022
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
7023
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
8372
+ nr = kvm_rax_read(vcpu);
8373
+ a0 = kvm_rbx_read(vcpu);
8374
+ a1 = kvm_rcx_read(vcpu);
8375
+ a2 = kvm_rdx_read(vcpu);
8376
+ a3 = kvm_rsi_read(vcpu);
70248377
70258378 trace_kvm_hypercall(nr, a0, a1, a2, a3);
70268379
....@@ -7033,17 +8386,23 @@
70338386 a3 &= 0xFFFFFFFF;
70348387 }
70358388
7036
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
8389
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
70378390 ret = -KVM_EPERM;
70388391 goto out;
70398392 }
8393
+
8394
+ ret = -KVM_ENOSYS;
70408395
70418396 switch (nr) {
70428397 case KVM_HC_VAPIC_POLL_IRQ:
70438398 ret = 0;
70448399 break;
70458400 case KVM_HC_KICK_CPU:
8401
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8402
+ break;
8403
+
70468404 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8405
+ kvm_sched_yield(vcpu->kvm, a1);
70478406 ret = 0;
70488407 break;
70498408 #ifdef CONFIG_X86_64
....@@ -7052,7 +8411,17 @@
70528411 break;
70538412 #endif
70548413 case KVM_HC_SEND_IPI:
8414
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8415
+ break;
8416
+
70558417 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8418
+ break;
8419
+ case KVM_HC_SCHED_YIELD:
8420
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8421
+ break;
8422
+
8423
+ kvm_sched_yield(vcpu->kvm, a0);
8424
+ ret = 0;
70568425 break;
70578426 default:
70588427 ret = -KVM_ENOSYS;
....@@ -7061,7 +8430,7 @@
70618430 out:
70628431 if (!op_64_bit)
70638432 ret = (u32)ret;
7064
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
8433
+ kvm_rax_write(vcpu, ret);
70658434
70668435 ++vcpu->stat.hypercalls;
70678436 return kvm_skip_emulated_instruction(vcpu);
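The register convention visible above (hypercall number in RAX, arguments in RBX/RCX/RDX/RSI, return value written back to RAX) is also the guest-side contract. A hedged guest-side sketch using the Intel "vmcall" encoding; AMD guests use "vmmcall", and the in-kernel kvm_para.h helpers pick the right one, so this is purely illustrative:

static inline long kvm_hypercall2_example(unsigned int nr,
					  unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1), "c"(p2)
		     : "memory");
	return ret;
}

A guest would kick a halted vCPU with kvm_hypercall2_example(KVM_HC_KICK_CPU, 0, apic_id), matching the a0/a1 usage above.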
....@@ -7074,7 +8443,7 @@
70748443 char instruction[3];
70758444 unsigned long rip = kvm_rip_read(vcpu);
70768445
7077
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
8446
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
70788447
70798448 return emulator_write_emulated(ctxt, rip, instruction, 3,
70808449 &ctxt->exception);
....@@ -7103,7 +8472,7 @@
71038472 {
71048473 int max_irr, tpr;
71058474
7106
- if (!kvm_x86_ops->update_cr8_intercept)
8475
+ if (!kvm_x86_ops.update_cr8_intercept)
71078476 return;
71088477
71098478 if (!lapic_in_kernel(vcpu))
....@@ -7122,24 +8491,32 @@
71228491
71238492 tpr = kvm_lapic_get_cr8(vcpu);
71248493
7125
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
8494
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
71268495 }
71278496
71288497 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
71298498 {
7130
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
7131
- vcpu->arch.exception.error_code = false;
7132
- kvm_x86_ops->queue_exception(vcpu);
8499
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
8500
+ vcpu->arch.exception.has_error_code,
8501
+ vcpu->arch.exception.error_code,
8502
+ vcpu->arch.exception.injected);
8503
+
8504
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
8505
+ vcpu->arch.exception.error_code = false;
8506
+ kvm_x86_ops.queue_exception(vcpu);
71338507 }
71348508
7135
-static int inject_pending_event(struct kvm_vcpu *vcpu)
8509
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
71368510 {
71378511 int r;
8512
+ bool can_inject = true;
71388513
71398514 /* try to reinject previous events if any */
71408515
7141
- if (vcpu->arch.exception.injected)
8516
+ if (vcpu->arch.exception.injected) {
71428517 kvm_inject_exception(vcpu);
8518
+ can_inject = false;
8519
+ }
71438520 /*
71448521 * Do not inject an NMI or interrupt if there is a pending
71458522 * exception. Exceptions and interrupts are recognized at
....@@ -7155,11 +8532,17 @@
71558532 * fully complete the previous instruction.
71568533 */
71578534 else if (!vcpu->arch.exception.pending) {
7158
- if (vcpu->arch.nmi_injected)
7159
- kvm_x86_ops->set_nmi(vcpu);
7160
- else if (vcpu->arch.interrupt.injected)
7161
- kvm_x86_ops->set_irq(vcpu);
8535
+ if (vcpu->arch.nmi_injected) {
8536
+ kvm_x86_ops.set_nmi(vcpu);
8537
+ can_inject = false;
8538
+ } else if (vcpu->arch.interrupt.injected) {
8539
+ kvm_x86_ops.set_irq(vcpu);
8540
+ can_inject = false;
8541
+ }
71628542 }
8543
+
8544
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
8545
+ vcpu->arch.exception.pending);
71638546
71648547 /*
71658548 * Call check_nested_events() even if we reinjected a previous event
....@@ -7167,69 +8550,107 @@
71678550 * from L2 to L1 due to pending L1 events which require exit
71688551 * from L2 to L1.
71698552 */
7170
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7171
- r = kvm_x86_ops->check_nested_events(vcpu);
7172
- if (r != 0)
7173
- return r;
8553
+ if (is_guest_mode(vcpu)) {
8554
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
8555
+ if (r < 0)
8556
+ goto busy;
71748557 }
71758558
71768559 /* try to inject new event if pending */
71778560 if (vcpu->arch.exception.pending) {
7178
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
7179
- vcpu->arch.exception.has_error_code,
7180
- vcpu->arch.exception.error_code);
7181
-
7182
- WARN_ON_ONCE(vcpu->arch.exception.injected);
7183
- vcpu->arch.exception.pending = false;
7184
- vcpu->arch.exception.injected = true;
7185
-
8561
+ /*
8562
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
8563
+		 * value pushed on the stack.  Trap-like exceptions and all #DBs
8564
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
8565
+		 * AMD states that code breakpoint #DBs explicitly clear RF=0).
8566
+ *
8567
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
8568
+ * describe the behavior of General Detect #DBs, which are
8569
+ * fault-like. They do _not_ set RF, a la code breakpoints.
8570
+ */
71868571 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
71878572 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
71888573 X86_EFLAGS_RF);
71898574
7190
- if (vcpu->arch.exception.nr == DB_VECTOR &&
7191
- (vcpu->arch.dr7 & DR7_GD)) {
7192
- vcpu->arch.dr7 &= ~DR7_GD;
7193
- kvm_update_dr7(vcpu);
8575
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
8576
+ kvm_deliver_exception_payload(vcpu);
8577
+ if (vcpu->arch.dr7 & DR7_GD) {
8578
+ vcpu->arch.dr7 &= ~DR7_GD;
8579
+ kvm_update_dr7(vcpu);
8580
+ }
71948581 }
71958582
71968583 kvm_inject_exception(vcpu);
8584
+
8585
+ vcpu->arch.exception.pending = false;
8586
+ vcpu->arch.exception.injected = true;
8587
+
8588
+ can_inject = false;
71978589 }
71988590
7199
- /* Don't consider new event if we re-injected an event */
7200
- if (kvm_event_needs_reinjection(vcpu))
7201
- return 0;
7202
-
7203
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
7204
- kvm_x86_ops->smi_allowed(vcpu)) {
7205
- vcpu->arch.smi_pending = false;
7206
- ++vcpu->arch.smi_count;
7207
- enter_smm(vcpu);
7208
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
7209
- --vcpu->arch.nmi_pending;
7210
- vcpu->arch.nmi_injected = true;
7211
- kvm_x86_ops->set_nmi(vcpu);
7212
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
7213
- /*
7214
- * Because interrupts can be injected asynchronously, we are
7215
- * calling check_nested_events again here to avoid a race condition.
7216
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
7217
- * proposal and current concerns. Perhaps we should be setting
7218
- * KVM_REQ_EVENT only on certain events and not unconditionally?
7219
- */
7220
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7221
- r = kvm_x86_ops->check_nested_events(vcpu);
7222
- if (r != 0)
7223
- return r;
7224
- }
7225
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
7226
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
7227
- false);
7228
- kvm_x86_ops->set_irq(vcpu);
7229
- }
8591
+ /*
8592
+ * Finally, inject interrupt events. If an event cannot be injected
8593
+ * due to architectural conditions (e.g. IF=0) a window-open exit
8594
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
8595
+ * and can architecturally be injected, but we cannot do it right now:
8596
+ * an interrupt could have arrived just now and we have to inject it
8597
+	 * as a vmexit, or there could already be an event in the queue, which is
8598
+ * indicated by can_inject. In that case we request an immediate exit
8599
+ * in order to make progress and get back here for another iteration.
8600
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8601
+ */
8602
+ if (vcpu->arch.smi_pending) {
8603
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8604
+ if (r < 0)
8605
+ goto busy;
8606
+ if (r) {
8607
+ vcpu->arch.smi_pending = false;
8608
+ ++vcpu->arch.smi_count;
8609
+ enter_smm(vcpu);
8610
+ can_inject = false;
8611
+ } else
8612
+ kvm_x86_ops.enable_smi_window(vcpu);
72308613 }
72318614
7232
- return 0;
8615
+ if (vcpu->arch.nmi_pending) {
8616
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8617
+ if (r < 0)
8618
+ goto busy;
8619
+ if (r) {
8620
+ --vcpu->arch.nmi_pending;
8621
+ vcpu->arch.nmi_injected = true;
8622
+ kvm_x86_ops.set_nmi(vcpu);
8623
+ can_inject = false;
8624
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8625
+ }
8626
+ if (vcpu->arch.nmi_pending)
8627
+ kvm_x86_ops.enable_nmi_window(vcpu);
8628
+ }
8629
+
8630
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
8631
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8632
+ if (r < 0)
8633
+ goto busy;
8634
+ if (r) {
8635
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8636
+ kvm_x86_ops.set_irq(vcpu);
8637
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8638
+ }
8639
+ if (kvm_cpu_has_injectable_intr(vcpu))
8640
+ kvm_x86_ops.enable_irq_window(vcpu);
8641
+ }
8642
+
8643
+ if (is_guest_mode(vcpu) &&
8644
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
8645
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8646
+ *req_immediate_exit = true;
8647
+
8648
+ WARN_ON(vcpu->arch.exception.pending);
8649
+ return;
8650
+
8651
+busy:
8652
+ *req_immediate_exit = true;
8653
+ return;
72338654 }
72348655
72358656 static void process_nmi(struct kvm_vcpu *vcpu)
....@@ -7241,7 +8662,7 @@
72418662 * If an NMI is already in progress, limit further NMIs to just one.
72428663 * Otherwise, allow two (and we'll inject the first one immediately).
72438664 */
7244
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8665
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
72458666 limit = 1;
72468667
72478668 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
....@@ -7331,11 +8752,11 @@
73318752 put_smstate(u32, buf, 0x7f7c, seg.limit);
73328753 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
73338754
7334
- kvm_x86_ops->get_gdt(vcpu, &dt);
8755
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73358756 put_smstate(u32, buf, 0x7f74, dt.address);
73368757 put_smstate(u32, buf, 0x7f70, dt.size);
73378758
7338
- kvm_x86_ops->get_idt(vcpu, &dt);
8759
+ kvm_x86_ops.get_idt(vcpu, &dt);
73398760 put_smstate(u32, buf, 0x7f58, dt.address);
73408761 put_smstate(u32, buf, 0x7f54, dt.size);
73418762
....@@ -7385,7 +8806,7 @@
73858806 put_smstate(u32, buf, 0x7e94, seg.limit);
73868807 put_smstate(u64, buf, 0x7e98, seg.base);
73878808
7388
- kvm_x86_ops->get_idt(vcpu, &dt);
8809
+ kvm_x86_ops.get_idt(vcpu, &dt);
73898810 put_smstate(u32, buf, 0x7e84, dt.size);
73908811 put_smstate(u64, buf, 0x7e88, dt.address);
73918812
....@@ -7395,7 +8816,7 @@
73958816 put_smstate(u32, buf, 0x7e74, seg.limit);
73968817 put_smstate(u64, buf, 0x7e78, seg.base);
73978818
7398
- kvm_x86_ops->get_gdt(vcpu, &dt);
8819
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73998820 put_smstate(u32, buf, 0x7e64, dt.size);
74008821 put_smstate(u64, buf, 0x7e68, dt.address);
74018822
....@@ -7425,28 +8846,28 @@
74258846 * vCPU state (e.g. leave guest mode) after we've saved the state into
74268847 * the SMM state-save area.
74278848 */
7428
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
8849
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
74298850
74308851 vcpu->arch.hflags |= HF_SMM_MASK;
74318852 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
74328853
7433
- if (kvm_x86_ops->get_nmi_mask(vcpu))
8854
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
74348855 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
74358856 else
7436
- kvm_x86_ops->set_nmi_mask(vcpu, true);
8857
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
74378858
74388859 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
74398860 kvm_rip_write(vcpu, 0x8000);
74408861
74418862 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
7442
- kvm_x86_ops->set_cr0(vcpu, cr0);
8863
+ kvm_x86_ops.set_cr0(vcpu, cr0);
74438864 vcpu->arch.cr0 = cr0;
74448865
7445
- kvm_x86_ops->set_cr4(vcpu, 0);
8866
+ kvm_x86_ops.set_cr4(vcpu, 0);
74468867
74478868 /* Undocumented: IDT limit is set to zero on entry to SMM. */
74488869 dt.address = dt.size = 0;
7449
- kvm_x86_ops->set_idt(vcpu, &dt);
8870
+ kvm_x86_ops.set_idt(vcpu, &dt);
74508871
74518872 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
74528873
....@@ -7477,10 +8898,10 @@
74778898
74788899 #ifdef CONFIG_X86_64
74798900 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
7480
- kvm_x86_ops->set_efer(vcpu, 0);
8901
+ kvm_x86_ops.set_efer(vcpu, 0);
74818902 #endif
74828903
7483
- kvm_update_cpuid(vcpu);
8904
+ kvm_update_cpuid_runtime(vcpu);
74848905 kvm_mmu_reset_context(vcpu);
74858906 }
74868907
....@@ -7490,10 +8911,82 @@
74908911 kvm_make_request(KVM_REQ_EVENT, vcpu);
74918912 }
74928913
8914
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8915
+ unsigned long *vcpu_bitmap)
8916
+{
8917
+ cpumask_var_t cpus;
8918
+
8919
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8920
+
8921
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8922
+ NULL, vcpu_bitmap, cpus);
8923
+
8924
+ free_cpumask_var(cpus);
8925
+}
8926
+
74938927 void kvm_make_scan_ioapic_request(struct kvm *kvm)
74948928 {
74958929 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
74968930 }
8931
+
8932
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8933
+{
8934
+ if (!lapic_in_kernel(vcpu))
8935
+ return;
8936
+
8937
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8938
+ kvm_apic_update_apicv(vcpu);
8939
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8940
+}
8941
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8942
+
8943
+/*
8944
+ * NOTE: Do not hold any lock prior to calling this.
8945
+ *
8946
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8947
+ * locked, because it calls __x86_set_memory_region() which does
8948
+ * synchronize_srcu(&kvm->srcu).
8949
+ */
8950
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8951
+{
8952
+ struct kvm_vcpu *except;
8953
+ unsigned long old, new, expected;
8954
+
8955
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8956
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8957
+ return;
8958
+
8959
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8960
+ do {
8961
+ expected = new = old;
8962
+ if (activate)
8963
+ __clear_bit(bit, &new);
8964
+ else
8965
+ __set_bit(bit, &new);
8966
+ if (new == old)
8967
+ break;
8968
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8969
+ } while (old != expected);
8970
+
8971
+ if (!!old == !!new)
8972
+ return;
8973
+
8974
+ trace_kvm_apicv_update_request(activate, bit);
8975
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8976
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8977
+
8978
+ /*
8979
+	 * Send a request to update APICv for all other vCPUs, while
8980
+	 * updating the calling vCPU immediately instead of waiting
8981
+	 * for another #VMEXIT to handle the request.
8982
+ */
8983
+ except = kvm_get_running_vcpu();
8984
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8985
+ except);
8986
+ if (except)
8987
+ kvm_vcpu_update_apicv(except);
8988
+}
8989
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
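The update loop above is a standard lock-free read-modify-write: recompute the new mask from the latest observed value, retry the cmpxchg until no other writer intervenes, and bail out early if the bit is already in the requested state. A standalone C11 analogue of just that pattern (toy code, not the kernel's cmpxchg()):

#include <stdatomic.h>
#include <stdbool.h>

/* Returns true when the overall "any inhibit bit set" state flipped. */
static bool toy_update_reason(_Atomic unsigned long *reasons,
			      unsigned int bit, bool activate)
{
	unsigned long old = atomic_load(reasons);
	unsigned long new;

	do {
		new = activate ? old & ~(1UL << bit) : old | (1UL << bit);
		if (new == old)
			break;	/* nothing to do, another caller won */
	} while (!atomic_compare_exchange_weak(reasons, &old, new));

	return !!old != !!new;
}

As in the function above, comparing the truth values of the old and new masks tells the caller whether the overall activation state actually changed.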
74978990
74988991 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
74998992 {
....@@ -7506,7 +8999,7 @@
75068999 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
75079000 else {
75089001 if (vcpu->arch.apicv_active)
7509
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9002
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
75109003 if (ioapic_in_kernel(vcpu->kvm))
75119004 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
75129005 }
....@@ -7526,7 +9019,7 @@
75269019
75279020 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
75289021 vcpu_to_synic(vcpu)->vec_bitmap, 256);
7529
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
9022
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
75309023 }
75319024
75329025 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
....@@ -7543,28 +9036,22 @@
75439036 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
75449037 }
75459038
9039
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
9040
+{
9041
+ if (kvm_x86_ops.guest_memory_reclaimed)
9042
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
9043
+}
9044
+
75469045 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
75479046 {
7548
- struct page *page = NULL;
7549
-
75509047 if (!lapic_in_kernel(vcpu))
75519048 return;
75529049
7553
- if (!kvm_x86_ops->set_apic_access_page_addr)
9050
+ if (!kvm_x86_ops.set_apic_access_page_addr)
75549051 return;
75559052
7556
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
7557
- if (is_error_page(page))
7558
- return;
7559
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
7560
-
7561
- /*
7562
- * Do not pin apic access page in memory, the MMU notifier
7563
- * will call us again if it is migrated or swapped out.
7564
- */
7565
- put_page(page);
9053
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
75669054 }
7567
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
75689055
75699056 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
75709057 {
....@@ -7583,12 +9070,17 @@
75839070 bool req_int_win =
75849071 dm_request_for_irq_injection(vcpu) &&
75859072 kvm_cpu_accept_dm_intr(vcpu);
9073
+ fastpath_t exit_fastpath;
75869074
75879075 bool req_immediate_exit = false;
75889076
75899077 if (kvm_request_pending(vcpu)) {
7590
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
7591
- kvm_x86_ops->get_vmcs12_pages(vcpu);
9078
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9079
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
9080
+ r = 0;
9081
+ goto out;
9082
+ }
9083
+ }
75929084 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
75939085 kvm_mmu_unload(vcpu);
75949086 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
....@@ -7604,10 +9096,19 @@
76049096 }
76059097 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
76069098 kvm_mmu_sync_roots(vcpu);
7607
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
7608
- kvm_mmu_load_cr3(vcpu);
7609
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
7610
- kvm_vcpu_flush_tlb(vcpu, true);
9099
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
9100
+ kvm_mmu_load_pgd(vcpu);
9101
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
9102
+ kvm_vcpu_flush_tlb_all(vcpu);
9103
+
9104
+ /* Flushing all ASIDs flushes the current ASID... */
9105
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9106
+ }
9107
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
9108
+ kvm_vcpu_flush_tlb_current(vcpu);
9109
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
9110
+ kvm_vcpu_flush_tlb_guest(vcpu);
9111
+
76119112 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
76129113 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
76139114 r = 0;
....@@ -7678,6 +9179,12 @@
76789179 */
76799180 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
76809181 kvm_hv_process_stimers(vcpu);
9182
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
9183
+ kvm_vcpu_update_apicv(vcpu);
9184
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
9185
+ kvm_check_async_pf_completion(vcpu);
9186
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
9187
+ kvm_x86_ops.msr_filter_changed(vcpu);
76819188 }
76829189
76839190 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
....@@ -7688,32 +9195,9 @@
76889195 goto out;
76899196 }
76909197
7691
- if (inject_pending_event(vcpu) != 0)
7692
- req_immediate_exit = true;
7693
- else {
7694
- /* Enable SMI/NMI/IRQ window open exits if needed.
7695
- *
7696
- * SMIs have three cases:
7697
- * 1) They can be nested, and then there is nothing to
7698
- * do here because RSM will cause a vmexit anyway.
7699
- * 2) There is an ISA-specific reason why SMI cannot be
7700
- * injected, and the moment when this changes can be
7701
- * intercepted.
7702
- * 3) Or the SMI can be pending because
7703
- * inject_pending_event has completed the injection
7704
- * of an IRQ or NMI from the previous vmexit, and
7705
- * then we request an immediate exit to inject the
7706
- * SMI.
7707
- */
7708
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
7709
- if (!kvm_x86_ops->enable_smi_window(vcpu))
7710
- req_immediate_exit = true;
7711
- if (vcpu->arch.nmi_pending)
7712
- kvm_x86_ops->enable_nmi_window(vcpu);
7713
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
7714
- kvm_x86_ops->enable_irq_window(vcpu);
7715
- WARN_ON(vcpu->arch.exception.pending);
7716
- }
9198
+ inject_pending_event(vcpu, &req_immediate_exit);
9199
+ if (req_int_win)
9200
+ kvm_x86_ops.enable_irq_window(vcpu);
77179201
77189202 if (kvm_lapic_enabled(vcpu)) {
77199203 update_cr8_intercept(vcpu);
....@@ -7728,7 +9212,7 @@
77289212
77299213 preempt_disable();
77309214
7731
- kvm_x86_ops->prepare_guest_switch(vcpu);
9215
+ kvm_x86_ops.prepare_guest_switch(vcpu);
77329216
77339217 /*
77349218 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
....@@ -7744,7 +9228,7 @@
77449228 * 1) We should set ->mode before checking ->requests. Please see
77459229 * the comment in kvm_vcpu_exiting_guest_mode().
77469230 *
7747
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
9231
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
77489232 * pairs with the memory barrier implicit in pi_test_and_set_on
77499233 * (see vmx_deliver_posted_interrupt).
77509234 *
....@@ -7759,10 +9243,9 @@
77599243 * notified with kvm_vcpu_kick.
77609244 */
77619245 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
7762
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9246
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
77639247
7764
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
7765
- || need_resched() || signal_pending(current)) {
9248
+ if (kvm_vcpu_exit_request(vcpu)) {
77669249 vcpu->mode = OUTSIDE_GUEST_MODE;
77679250 smp_wmb();
77689251 local_irq_enable();
....@@ -7774,13 +9257,14 @@
77749257
77759258 if (req_immediate_exit) {
77769259 kvm_make_request(KVM_REQ_EVENT, vcpu);
7777
- kvm_x86_ops->request_immediate_exit(vcpu);
9260
+ kvm_x86_ops.request_immediate_exit(vcpu);
77789261 }
77799262
7780
- trace_kvm_entry(vcpu->vcpu_id);
7781
- if (lapic_timer_advance_ns)
7782
- wait_lapic_expire(vcpu);
7783
- guest_enter_irqoff();
9263
+ trace_kvm_entry(vcpu);
9264
+
9265
+ fpregs_assert_state_consistent();
9266
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9267
+ switch_fpu_return();
77849268
77859269 if (unlikely(vcpu->arch.switch_db_regs)) {
77869270 set_debugreg(0, 7);
....@@ -7794,7 +9278,7 @@
77949278 set_debugreg(0, 7);
77959279 }
77969280
7797
- kvm_x86_ops->run(vcpu);
9281
+ exit_fastpath = kvm_x86_ops.run(vcpu);
77989282
77999283 /*
78009284 * Do this here before restoring debug registers on the host. And
....@@ -7804,9 +9288,8 @@
78049288 */
78059289 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
78069290 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
7807
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
9291
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
78089292 kvm_update_dr0123(vcpu);
7809
- kvm_update_dr6(vcpu);
78109293 kvm_update_dr7(vcpu);
78119294 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
78129295 }
....@@ -7821,18 +9304,43 @@
78219304 if (hw_breakpoint_active())
78229305 hw_breakpoint_restore();
78239306
9307
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
78249308 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
78259309
78269310 vcpu->mode = OUTSIDE_GUEST_MODE;
78279311 smp_wmb();
78289312
9313
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
9314
+
9315
+ /*
9316
+ * Consume any pending interrupts, including the possible source of
9317
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
9318
+ * An instruction is required after local_irq_enable() to fully unblock
9319
+ * interrupts on processors that implement an interrupt shadow, the
9320
+ * stat.exits increment will do nicely.
9321
+ */
78299322 kvm_before_interrupt(vcpu);
7830
- kvm_x86_ops->handle_external_intr(vcpu);
9323
+ local_irq_enable();
9324
+ ++vcpu->stat.exits;
9325
+ local_irq_disable();
78319326 kvm_after_interrupt(vcpu);
78329327
7833
- ++vcpu->stat.exits;
9328
+ /*
9329
+ * Wait until after servicing IRQs to account guest time so that any
9330
+ * ticks that occurred while running the guest are properly accounted
9331
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
9332
+ * of accounting via context tracking, but the loss of accuracy is
9333
+ * acceptable for all known use cases.
9334
+ */
9335
+ vtime_account_guest_exit();
78349336
7835
- guest_exit_irqoff();
9337
+ if (lapic_in_kernel(vcpu)) {
9338
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9339
+ if (delta != S64_MIN) {
9340
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9341
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9342
+ }
9343
+ }
78369344
78379345 local_irq_enable();
78389346 preempt_enable();
....@@ -7853,12 +9361,13 @@
78539361 if (vcpu->arch.apic_attention)
78549362 kvm_lapic_sync_from_vapic(vcpu);
78559363
7856
- vcpu->arch.gpa_available = false;
7857
- r = kvm_x86_ops->handle_exit(vcpu);
9364
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
78589365 return r;
78599366
78609367 cancel_injection:
7861
- kvm_x86_ops->cancel_injection(vcpu);
9368
+ if (req_immediate_exit)
9369
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
9370
+ kvm_x86_ops.cancel_injection(vcpu);
78629371 if (unlikely(vcpu->arch.apic_attention))
78639372 kvm_lapic_sync_from_vapic(vcpu);
78649373 out:
....@@ -7868,13 +9377,13 @@
78689377 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
78699378 {
78709379 if (!kvm_arch_vcpu_runnable(vcpu) &&
7871
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
9380
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
78729381 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
78739382 kvm_vcpu_block(vcpu);
78749383 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
78759384
7876
- if (kvm_x86_ops->post_block)
7877
- kvm_x86_ops->post_block(vcpu);
9385
+ if (kvm_x86_ops.post_block)
9386
+ kvm_x86_ops.post_block(vcpu);
78789387
78799388 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
78809389 return 1;
....@@ -7886,6 +9395,7 @@
78869395 vcpu->arch.pv.pv_unhalted = false;
78879396 vcpu->arch.mp_state =
78889397 KVM_MP_STATE_RUNNABLE;
9398
+ fallthrough;
78899399 case KVM_MP_STATE_RUNNABLE:
78909400 vcpu->arch.apf.halted = false;
78919401 break;
....@@ -7893,15 +9403,14 @@
78939403 break;
78949404 default:
78959405 return -EINTR;
7896
- break;
78979406 }
78989407 return 1;
78999408 }
79009409
79019410 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
79029411 {
7903
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7904
- kvm_x86_ops->check_nested_events(vcpu);
9412
+ if (is_guest_mode(vcpu))
9413
+ kvm_x86_ops.nested_ops->check_events(vcpu);
79059414
79069415 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
79079416 !vcpu->arch.apf.halted);
....@@ -7916,6 +9425,13 @@
79169425 vcpu->arch.l1tf_flush_l1d = true;
79179426
79189427 for (;;) {
9428
+ /*
9429
+ * If another guest vCPU requests a PV TLB flush in the middle
9430
+ * of instruction emulation, the rest of the emulation could
9431
+ * use a stale page translation. Assume that any code after
9432
+ * this point can start executing an instruction.
9433
+ */
9434
+ vcpu->arch.at_instruction_boundary = false;
79199435 if (kvm_vcpu_running(vcpu)) {
79209436 r = vcpu_enter_guest(vcpu);
79219437 } else {
....@@ -7937,17 +9453,11 @@
79379453 break;
79389454 }
79399455
7940
- kvm_check_async_pf_completion(vcpu);
7941
-
7942
- if (signal_pending(current)) {
7943
- r = -EINTR;
7944
- vcpu->run->exit_reason = KVM_EXIT_INTR;
7945
- ++vcpu->stat.signal_exits;
7946
- break;
7947
- }
7948
- if (need_resched()) {
9456
+ if (__xfer_to_guest_mode_work_pending()) {
79499457 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
7950
- cond_resched();
9458
+ r = xfer_to_guest_mode_handle_work(vcpu);
9459
+ if (r)
9460
+ return r;
79519461 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
79529462 }
79539463 }
....@@ -7960,12 +9470,11 @@
79609470 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
79619471 {
79629472 int r;
9473
+
79639474 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
79649475 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
79659476 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
7966
- if (r != EMULATE_DONE)
7967
- return 0;
7968
- return 1;
9477
+ return r;
79699478 }
79709479
79719480 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
....@@ -8038,31 +9547,55 @@
80389547 return 0;
80399548 }
80409549
9550
+static void kvm_save_current_fpu(struct fpu *fpu)
9551
+{
9552
+ /*
9553
+ * If the target FPU state is not resident in the CPU registers, just
9554
+ * memcpy() from current, else save CPU state directly to the target.
9555
+ */
9556
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9557
+ memcpy(&fpu->state, &current->thread.fpu.state,
9558
+ fpu_kernel_xstate_size);
9559
+ else
9560
+ copy_fpregs_to_fpstate(fpu);
9561
+}
9562
+
80419563 /* Swap (qemu) user FPU context for the guest FPU context. */
80429564 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
80439565 {
8044
- preempt_disable();
8045
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
8046
- /* PKRU is separately restored in kvm_x86_ops->run. */
8047
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
9566
+ fpregs_lock();
9567
+
9568
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
9569
+
9570
+ /* PKRU is separately restored in kvm_x86_ops.run. */
9571
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
80489572 ~XFEATURE_MASK_PKRU);
8049
- preempt_enable();
9573
+
9574
+ fpregs_mark_activate();
9575
+ fpregs_unlock();
9576
+
80509577 trace_kvm_fpu(1);
80519578 }
80529579
80539580 /* When vcpu_run ends, restore user space FPU context. */
80549581 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
80559582 {
8056
- preempt_disable();
8057
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
8058
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
8059
- preempt_enable();
9583
+ fpregs_lock();
9584
+
9585
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
9586
+
9587
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9588
+
9589
+ fpregs_mark_activate();
9590
+ fpregs_unlock();
9591
+
80609592 ++vcpu->stat.fpu_reload;
80619593 trace_kvm_fpu(0);
80629594 }
80639595
8064
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
9596
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
80659597 {
9598
+ struct kvm_run *kvm_run = vcpu->run;
80669599 int r;
80679600
80689601 vcpu_load(vcpu);
....@@ -8080,18 +9613,18 @@
80809613 r = -EAGAIN;
80819614 if (signal_pending(current)) {
80829615 r = -EINTR;
8083
- vcpu->run->exit_reason = KVM_EXIT_INTR;
9616
+ kvm_run->exit_reason = KVM_EXIT_INTR;
80849617 ++vcpu->stat.signal_exits;
80859618 }
80869619 goto out;
80879620 }
80889621
8089
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9622
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
80909623 r = -EINVAL;
80919624 goto out;
80929625 }
80939626
8094
- if (vcpu->run->kvm_dirty_regs) {
9627
+ if (kvm_run->kvm_dirty_regs) {
80959628 r = sync_regs(vcpu);
80969629 if (r != 0)
80979630 goto out;
....@@ -8121,7 +9654,7 @@
81219654
81229655 out:
81239656 kvm_put_guest_fpu(vcpu);
8124
- if (vcpu->run->kvm_valid_regs)
9657
+ if (kvm_run->kvm_valid_regs)
81259658 store_regs(vcpu);
81269659 post_kvm_run_save(vcpu);
81279660 kvm_sigset_deactivate(vcpu);
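kvm_arch_vcpu_ioctl_run() is the kernel end of userspace's KVM_RUN loop; the kvm_valid_regs/kvm_dirty_regs fields checked above live in the mmap()ed struct kvm_run. A minimal userspace sketch of that loop (error handling trimmed; vcpu_fd and run are assumed to come from KVM_CREATE_VCPU and an mmap of the vCPU fd):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int run_loop_example(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;		/* -EINTR surfaces here too */

		switch (run->exit_reason) {
		case KVM_EXIT_IO:
		case KVM_EXIT_MMIO:
			/* emulate the device access, then loop */
			break;
		case KVM_EXIT_HLT:
			return 0;
		default:
			return -1;
		}
	}
}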
....@@ -8140,26 +9673,26 @@
81409673 	 * that usually, but some badly designed PV devices (vmware
81419674 * backdoor interface) need this to work
81429675 */
8143
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
9676
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
81449677 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81459678 }
8146
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
8147
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
8148
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
8149
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
8150
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
8151
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
8152
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
8153
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
9679
+ regs->rax = kvm_rax_read(vcpu);
9680
+ regs->rbx = kvm_rbx_read(vcpu);
9681
+ regs->rcx = kvm_rcx_read(vcpu);
9682
+ regs->rdx = kvm_rdx_read(vcpu);
9683
+ regs->rsi = kvm_rsi_read(vcpu);
9684
+ regs->rdi = kvm_rdi_read(vcpu);
9685
+ regs->rsp = kvm_rsp_read(vcpu);
9686
+ regs->rbp = kvm_rbp_read(vcpu);
81549687 #ifdef CONFIG_X86_64
8155
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
8156
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
8157
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
8158
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
8159
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
8160
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
8161
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
8162
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
9688
+ regs->r8 = kvm_r8_read(vcpu);
9689
+ regs->r9 = kvm_r9_read(vcpu);
9690
+ regs->r10 = kvm_r10_read(vcpu);
9691
+ regs->r11 = kvm_r11_read(vcpu);
9692
+ regs->r12 = kvm_r12_read(vcpu);
9693
+ regs->r13 = kvm_r13_read(vcpu);
9694
+ regs->r14 = kvm_r14_read(vcpu);
9695
+ regs->r15 = kvm_r15_read(vcpu);
81639696 #endif
81649697
81659698 regs->rip = kvm_rip_read(vcpu);
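The kvm_rax_read()/kvm_rax_write() style accessors used throughout this patch are thin wrappers over the generic register cache. One plausible way such wrappers can be generated, shown only as a sketch (the real definitions live in kvm_cache_regs.h and may differ in detail):

#define BUILD_KVM_GPR_ACCESSORS(lname, uname)				\
static inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)	\
{									\
	return kvm_register_read(vcpu, VCPU_REGS_##uname);		\
}									\
static inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,		\
				       unsigned long val)		\
{									\
	kvm_register_write(vcpu, VCPU_REGS_##uname, val);		\
}
BUILD_KVM_GPR_ACCESSORS(rax, RAX)
BUILD_KVM_GPR_ACCESSORS(rbx, RBX)

Either way, the per-register helpers keep call sites short and funnel every access through the lazy register cache.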
....@@ -8179,23 +9712,23 @@
81799712 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
81809713 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81819714
8182
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
8183
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
8184
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
8185
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
8186
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
8187
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
8188
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
8189
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
9715
+ kvm_rax_write(vcpu, regs->rax);
9716
+ kvm_rbx_write(vcpu, regs->rbx);
9717
+ kvm_rcx_write(vcpu, regs->rcx);
9718
+ kvm_rdx_write(vcpu, regs->rdx);
9719
+ kvm_rsi_write(vcpu, regs->rsi);
9720
+ kvm_rdi_write(vcpu, regs->rdi);
9721
+ kvm_rsp_write(vcpu, regs->rsp);
9722
+ kvm_rbp_write(vcpu, regs->rbp);
81909723 #ifdef CONFIG_X86_64
8191
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
8192
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
8193
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
8194
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
8195
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
8196
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
8197
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
8198
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
9724
+ kvm_r8_write(vcpu, regs->r8);
9725
+ kvm_r9_write(vcpu, regs->r9);
9726
+ kvm_r10_write(vcpu, regs->r10);
9727
+ kvm_r11_write(vcpu, regs->r11);
9728
+ kvm_r12_write(vcpu, regs->r12);
9729
+ kvm_r13_write(vcpu, regs->r13);
9730
+ kvm_r14_write(vcpu, regs->r14);
9731
+ kvm_r15_write(vcpu, regs->r15);
81999732 #endif
82009733
82019734 kvm_rip_write(vcpu, regs->rip);
....@@ -8238,10 +9771,10 @@
82389771 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
82399772 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
82409773
8241
- kvm_x86_ops->get_idt(vcpu, &dt);
9774
+ kvm_x86_ops.get_idt(vcpu, &dt);
82429775 sregs->idt.limit = dt.size;
82439776 sregs->idt.base = dt.address;
8244
- kvm_x86_ops->get_gdt(vcpu, &dt);
9777
+ kvm_x86_ops.get_gdt(vcpu, &dt);
82459778 sregs->gdt.limit = dt.size;
82469779 sregs->gdt.base = dt.address;
82479780
....@@ -8253,7 +9786,7 @@
82539786 sregs->efer = vcpu->arch.efer;
82549787 sregs->apic_base = kvm_get_apic_base(vcpu);
82559788
8256
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
9789
+ memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
82579790
82589791 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
82599792 set_bit(vcpu->arch.interrupt.nr,
....@@ -8300,8 +9833,12 @@
83009833 mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
83019834 goto out;
83029835
8303
- /* INITs are latched while in SMM */
8304
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
9836
+ /*
9837
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9838
+ * INIT state; latched init should be reported using
9839
+ * KVM_SET_VCPU_EVENTS, so reject it here.
9840
+ */
9841
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
83059842 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
83069843 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
83079844 goto out;
....@@ -8322,21 +9859,23 @@
83229859 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
83239860 int reason, bool has_error_code, u32 error_code)
83249861 {
8325
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
9862
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
83269863 int ret;
83279864
83289865 init_emulate_ctxt(vcpu);
83299866
83309867 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
83319868 has_error_code, error_code);
8332
-
8333
- if (ret)
8334
- return EMULATE_FAIL;
9869
+ if (ret) {
9870
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9871
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9872
+ vcpu->run->internal.ndata = 0;
9873
+ return 0;
9874
+ }
83359875
83369876 kvm_rip_write(vcpu, ctxt->eip);
83379877 kvm_set_rflags(vcpu, ctxt->eflags);
8338
- kvm_make_request(KVM_REQ_EVENT, vcpu);
8339
- return EMULATE_DONE;
9878
+ return 1;
83409879 }
83419880 EXPORT_SYMBOL_GPL(kvm_task_switch);
83429881
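/*
 * Editor's note -- illustrative sketch, not part of this patch. It only
 * demonstrates the new return convention of kvm_task_switch(): 1 means
 * "resume the guest", 0 means "exit to userspace" with vcpu->run already
 * filled in (KVM_EXIT_INTERNAL_ERROR on emulation failure). The function
 * and its pre-decoded arguments are hypothetical stand-ins for a vendor
 * task-switch exit handler, not existing code.
 */
static int example_handle_task_switch(struct kvm_vcpu *vcpu,
				      u16 tss_selector, int reason,
				      bool has_error_code, u32 error_code)
{
	/* Propagate the result unchanged: 1 = continue, 0 = userspace exit. */
	return kvm_task_switch(vcpu, tss_selector, -1, reason,
			       has_error_code, error_code);
}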
....@@ -8350,6 +9889,8 @@
83509889 */
83519890 if (!(sregs->cr4 & X86_CR4_PAE)
83529891 || !(sregs->efer & EFER_LMA))
9892
+ return -EINVAL;
9893
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
83539894 return -EINVAL;
83549895 } else {
83559896 /*
....@@ -8382,31 +9923,31 @@
83829923
83839924 dt.size = sregs->idt.limit;
83849925 dt.address = sregs->idt.base;
8385
- kvm_x86_ops->set_idt(vcpu, &dt);
9926
+ kvm_x86_ops.set_idt(vcpu, &dt);
83869927 dt.size = sregs->gdt.limit;
83879928 dt.address = sregs->gdt.base;
8388
- kvm_x86_ops->set_gdt(vcpu, &dt);
9929
+ kvm_x86_ops.set_gdt(vcpu, &dt);
83899930
83909931 vcpu->arch.cr2 = sregs->cr2;
83919932 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
83929933 vcpu->arch.cr3 = sregs->cr3;
8393
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
9934
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
83949935
83959936 kvm_set_cr8(vcpu, sregs->cr8);
83969937
83979938 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
8398
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
9939
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
83999940
84009941 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
8401
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
9942
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
84029943 vcpu->arch.cr0 = sregs->cr0;
84039944
84049945 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
84059946 cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
84069947 (X86_CR4_OSXSAVE | X86_CR4_PKE));
8407
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
9948
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
84089949 if (cpuid_update_needed)
8409
- kvm_update_cpuid(vcpu);
9950
+ kvm_update_cpuid_runtime(vcpu);
84109951
84119952 idx = srcu_read_lock(&vcpu->kvm->srcu);
84129953 if (is_pae_paging(vcpu)) {
....@@ -8510,7 +10051,7 @@
851010051 */
851110052 kvm_set_rflags(vcpu, rflags);
851210053
8513
- kvm_x86_ops->update_bp_intercept(vcpu);
10054
+ kvm_x86_ops.update_exception_bitmap(vcpu);
851410055
851510056 r = 0;
851610057
....@@ -8549,7 +10090,7 @@
854910090
855010091 vcpu_load(vcpu);
855110092
8552
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10093
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
855310094 memcpy(fpu->fpr, fxsave->st_space, 128);
855410095 fpu->fcw = fxsave->cwd;
855510096 fpu->fsw = fxsave->swd;
....@@ -8557,7 +10098,7 @@
855710098 fpu->last_opcode = fxsave->fop;
855810099 fpu->last_ip = fxsave->rip;
855910100 fpu->last_dp = fxsave->rdp;
8560
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
10101
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
856110102
856210103 vcpu_put(vcpu);
856310104 return 0;
....@@ -8569,7 +10110,7 @@
856910110
857010111 vcpu_load(vcpu);
857110112
8572
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10113
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
857310114
857410115 memcpy(fxsave->st_space, fpu->fpr, 128);
857510116 fxsave->cwd = fpu->fcw;
....@@ -8578,7 +10119,7 @@
857810119 fxsave->fop = fpu->last_opcode;
857910120 fxsave->rip = fpu->last_ip;
858010121 fxsave->rdp = fpu->last_dp;
8581
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
10122
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
858210123
858310124 vcpu_put(vcpu);
858410125 return 0;
....@@ -8625,9 +10166,9 @@
862510166
862610167 static void fx_init(struct kvm_vcpu *vcpu)
862710168 {
8628
- fpstate_init(&vcpu->arch.guest_fpu.state);
10169
+ fpstate_init(&vcpu->arch.guest_fpu->state);
862910170 if (boot_cpu_has(X86_FEATURE_XSAVES))
8630
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
10171
+ vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
863110172 host_xcr0 | XSTATE_COMPACTION_ENABLED;
863210173
863310174 /*
....@@ -8638,48 +10179,122 @@
863810179 vcpu->arch.cr0 |= X86_CR0_ET;
863910180 }
864010181
8641
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
10182
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
864210183 {
8643
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
8644
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
8645
-
8646
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
8647
-
8648
- kvmclock_reset(vcpu);
8649
-
8650
- kvm_x86_ops->vcpu_free(vcpu);
8651
- free_cpumask_var(wbinvd_dirty_mask);
8652
-}
8653
-
8654
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
8655
- unsigned int id)
8656
-{
8657
- struct kvm_vcpu *vcpu;
8658
-
865910184 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
8660
- printk_once(KERN_WARNING
8661
- "kvm: SMP vm created on host with unstable TSC; "
8662
- "guest TSC will not be reliable\n");
10185
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
10186
+ "guest TSC will not be reliable\n");
866310187
8664
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
8665
-
8666
- return vcpu;
10188
+ return 0;
866710189 }
866810190
8669
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
10191
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
867010192 {
10193
+ struct page *page;
10194
+ int r;
10195
+
10196
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
10197
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10198
+ else
10199
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
10200
+
10201
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
10202
+
10203
+ r = kvm_mmu_create(vcpu);
10204
+ if (r < 0)
10205
+ return r;
10206
+
10207
+ if (irqchip_in_kernel(vcpu->kvm)) {
10208
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
10209
+ if (r < 0)
10210
+ goto fail_mmu_destroy;
10211
+ if (kvm_apicv_activated(vcpu->kvm))
10212
+ vcpu->arch.apicv_active = true;
10213
+ } else
10214
+ static_key_slow_inc(&kvm_no_apic_vcpu);
10215
+
10216
+ r = -ENOMEM;
10217
+
10218
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
10219
+ if (!page)
10220
+ goto fail_free_lapic;
10221
+ vcpu->arch.pio_data = page_address(page);
10222
+
10223
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
10224
+ GFP_KERNEL_ACCOUNT);
10225
+ if (!vcpu->arch.mce_banks)
10226
+ goto fail_free_pio_data;
10227
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
10228
+
10229
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
10230
+ GFP_KERNEL_ACCOUNT))
10231
+ goto fail_free_mce_banks;
10232
+
10233
+ if (!alloc_emulate_ctxt(vcpu))
10234
+ goto free_wbinvd_dirty_mask;
10235
+
10236
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
10237
+ GFP_KERNEL_ACCOUNT);
10238
+ if (!vcpu->arch.user_fpu) {
10239
+ pr_err("kvm: failed to allocate userspace's fpu\n");
10240
+ goto free_emulate_ctxt;
10241
+ }
10242
+
10243
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
10244
+ GFP_KERNEL_ACCOUNT);
10245
+ if (!vcpu->arch.guest_fpu) {
10246
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
10247
+ goto free_user_fpu;
10248
+ }
10249
+ fx_init(vcpu);
10250
+
10251
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
10252
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
10253
+
10254
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
10255
+
10256
+ kvm_async_pf_hash_reset(vcpu);
10257
+ kvm_pmu_init(vcpu);
10258
+
10259
+ vcpu->arch.pending_external_vector = -1;
10260
+ vcpu->arch.preempted_in_kernel = false;
10261
+
10262
+ kvm_hv_vcpu_init(vcpu);
10263
+
10264
+ r = kvm_x86_ops.vcpu_create(vcpu);
10265
+ if (r)
10266
+ goto free_guest_fpu;
10267
+
867110268 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
10269
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
867210270 kvm_vcpu_mtrr_init(vcpu);
867310271 vcpu_load(vcpu);
867410272 kvm_vcpu_reset(vcpu, false);
8675
- kvm_mmu_setup(vcpu);
10273
+ kvm_init_mmu(vcpu, false);
867610274 vcpu_put(vcpu);
867710275 return 0;
10276
+
10277
+free_guest_fpu:
10278
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10279
+free_user_fpu:
10280
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10281
+free_emulate_ctxt:
10282
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10283
+free_wbinvd_dirty_mask:
10284
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10285
+fail_free_mce_banks:
10286
+ kfree(vcpu->arch.mce_banks);
10287
+fail_free_pio_data:
10288
+ free_page((unsigned long)vcpu->arch.pio_data);
10289
+fail_free_lapic:
10290
+ kvm_free_lapic(vcpu);
10291
+fail_mmu_destroy:
10292
+ kvm_mmu_destroy(vcpu);
10293
+ return r;
867810294 }
867910295
868010296 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
868110297 {
8682
- struct msr_data msr;
868310298 struct kvm *kvm = vcpu->kvm;
868410299
868510300 kvm_hv_vcpu_postcreate(vcpu);
....@@ -8687,23 +10302,43 @@
868710302 if (mutex_lock_killable(&vcpu->mutex))
868810303 return;
868910304 vcpu_load(vcpu);
8690
- msr.data = 0x0;
8691
- msr.index = MSR_IA32_TSC;
8692
- msr.host_initiated = true;
8693
- kvm_write_tsc(vcpu, &msr);
10305
+ kvm_synchronize_tsc(vcpu, 0);
869410306 vcpu_put(vcpu);
10307
+
10308
+ /* poll control enabled by default */
10309
+ vcpu->arch.msr_kvm_poll_control = 1;
10310
+
869510311 mutex_unlock(&vcpu->mutex);
869610312
8697
- if (!kvmclock_periodic_sync)
8698
- return;
8699
-
8700
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
8701
- KVMCLOCK_SYNC_PERIOD);
10313
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
10314
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
10315
+ KVMCLOCK_SYNC_PERIOD);
870210316 }
870310317
870410318 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
870510319 {
8706
- kvm_arch_vcpu_free(vcpu);
10320
+ int idx;
10321
+
10322
+ kvmclock_reset(vcpu);
10323
+
10324
+ kvm_x86_ops.vcpu_free(vcpu);
10325
+
10326
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10327
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10328
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10329
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10330
+
10331
+ kvm_hv_vcpu_uninit(vcpu);
10332
+ kvm_pmu_destroy(vcpu);
10333
+ kfree(vcpu->arch.mce_banks);
10334
+ kvm_free_lapic(vcpu);
10335
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
10336
+ kvm_mmu_destroy(vcpu);
10337
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
10338
+ free_page((unsigned long)vcpu->arch.pio_data);
10339
+ kvfree(vcpu->arch.cpuid_entries);
10340
+ if (!lapic_in_kernel(vcpu))
10341
+ static_key_slow_dec(&kvm_no_apic_vcpu);
870710342 }
870810343
870910344 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
....@@ -8719,19 +10354,18 @@
871910354 vcpu->arch.nmi_injected = false;
872010355 kvm_clear_interrupt_queue(vcpu);
872110356 kvm_clear_exception_queue(vcpu);
8722
- vcpu->arch.exception.pending = false;
872310357
872410358 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
872510359 kvm_update_dr0123(vcpu);
872610360 vcpu->arch.dr6 = DR6_INIT;
8727
- kvm_update_dr6(vcpu);
872810361 vcpu->arch.dr7 = DR7_FIXED_1;
872910362 kvm_update_dr7(vcpu);
873010363
873110364 vcpu->arch.cr2 = 0;
873210365
873310366 kvm_make_request(KVM_REQ_EVENT, vcpu);
8734
- vcpu->arch.apf.msr_val = 0;
10367
+ vcpu->arch.apf.msr_en_val = 0;
10368
+ vcpu->arch.apf.msr_int_val = 0;
873510369 vcpu->arch.st.msr_val = 0;
873610370
873710371 kvmclock_reset(vcpu);
....@@ -8749,12 +10383,12 @@
874910383 */
875010384 if (init_event)
875110385 kvm_put_guest_fpu(vcpu);
8752
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8753
- XFEATURE_MASK_BNDREGS);
10386
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10387
+ XFEATURE_BNDREGS);
875410388 if (mpx_state_buffer)
875510389 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
8756
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8757
- XFEATURE_MASK_BNDCSR);
10390
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10391
+ XFEATURE_BNDCSR);
875810392 if (mpx_state_buffer)
875910393 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
876010394 if (init_event)
....@@ -8765,7 +10399,6 @@
876510399 kvm_pmu_reset(vcpu);
876610400 vcpu->arch.smbase = 0x30000;
876710401
8768
- vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
876910402 vcpu->arch.msr_misc_features_enables = 0;
877010403
877110404 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
....@@ -8777,7 +10410,7 @@
877710410
877810411 vcpu->arch.ia32_xss = 0;
877910412
8780
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
10413
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
878110414 }
878210415
878310416 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
....@@ -8801,8 +10434,8 @@
880110434 u64 max_tsc = 0;
880210435 bool stable, backwards_tsc = false;
880310436
8804
- kvm_shared_msr_cpu_online();
8805
- ret = kvm_x86_ops->hardware_enable();
10437
+ kvm_user_return_msr_cpu_online();
10438
+ ret = kvm_x86_ops.hardware_enable();
880610439 if (ret != 0)
880710440 return ret;
880810441
....@@ -8828,7 +10461,7 @@
882810461 * before any KVM threads can be running. Unfortunately, we can't
882910462 * bring the TSCs fully up to date with real time, as we aren't yet far
883010463 * enough into CPU bringup that we know how much real time has actually
8831
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
10464
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
883210465 * variables that haven't been updated yet.
883310466 *
883410467 * So we simply find the maximum observed TSC above, then record the
....@@ -8884,19 +10517,32 @@
888410517
888510518 void kvm_arch_hardware_disable(void)
888610519 {
8887
- kvm_x86_ops->hardware_disable();
10520
+ kvm_x86_ops.hardware_disable();
888810521 drop_user_return_notifiers();
888910522 }
889010523
8891
-int kvm_arch_hardware_setup(void)
10524
+int kvm_arch_hardware_setup(void *opaque)
889210525 {
10526
+ struct kvm_x86_init_ops *ops = opaque;
889310527 int r;
889410528
8895
- r = kvm_x86_ops->hardware_setup();
10529
+ rdmsrl_safe(MSR_EFER, &host_efer);
10530
+
10531
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
10532
+ rdmsrl(MSR_IA32_XSS, host_xss);
10533
+
10534
+ r = ops->hardware_setup();
889610535 if (r != 0)
889710536 return r;
889810537
8899
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
10538
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10539
+
10540
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10541
+ supported_xss = 0;
10542
+
10543
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10544
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10545
+#undef __kvm_cpu_cap_has
890010546
890110547 if (kvm_has_tsc_control) {
890210548 /*
....@@ -8918,12 +10564,21 @@
891810564
891910565 void kvm_arch_hardware_unsetup(void)
892010566 {
8921
- kvm_x86_ops->hardware_unsetup();
10567
+ kvm_x86_ops.hardware_unsetup();
892210568 }
892310569
8924
-void kvm_arch_check_processor_compat(void *rtn)
10570
+int kvm_arch_check_processor_compat(void *opaque)
892510571 {
8926
- kvm_x86_ops->check_processor_compatibility(rtn);
10572
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10573
+ struct kvm_x86_init_ops *ops = opaque;
10574
+
10575
+ WARN_ON(!irqs_disabled());
10576
+
10577
+ if (__cr4_reserved_bits(cpu_has, c) !=
10578
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10579
+ return -EIO;
10580
+
10581
+ return ops->check_processor_compatibility();
892710582 }
892810583
892910584 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
....@@ -8940,107 +10595,35 @@
894010595 struct static_key kvm_no_apic_vcpu __read_mostly;
894110596 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
894210597
8943
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8944
-{
8945
- struct page *page;
8946
- int r;
8947
-
8948
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
8949
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
8950
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
8951
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
8952
- else
8953
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
8954
-
8955
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
8956
- if (!page) {
8957
- r = -ENOMEM;
8958
- goto fail;
8959
- }
8960
- vcpu->arch.pio_data = page_address(page);
8961
-
8962
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
8963
-
8964
- r = kvm_mmu_create(vcpu);
8965
- if (r < 0)
8966
- goto fail_free_pio_data;
8967
-
8968
- if (irqchip_in_kernel(vcpu->kvm)) {
8969
- r = kvm_create_lapic(vcpu);
8970
- if (r < 0)
8971
- goto fail_mmu_destroy;
8972
- } else
8973
- static_key_slow_inc(&kvm_no_apic_vcpu);
8974
-
8975
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
8976
- GFP_KERNEL);
8977
- if (!vcpu->arch.mce_banks) {
8978
- r = -ENOMEM;
8979
- goto fail_free_lapic;
8980
- }
8981
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
8982
-
8983
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
8984
- r = -ENOMEM;
8985
- goto fail_free_mce_banks;
8986
- }
8987
-
8988
- fx_init(vcpu);
8989
-
8990
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
8991
-
8992
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
8993
-
8994
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
8995
-
8996
- kvm_async_pf_hash_reset(vcpu);
8997
- kvm_pmu_init(vcpu);
8998
-
8999
- vcpu->arch.pending_external_vector = -1;
9000
- vcpu->arch.preempted_in_kernel = false;
9001
-
9002
- kvm_hv_vcpu_init(vcpu);
9003
-
9004
- return 0;
9005
-
9006
-fail_free_mce_banks:
9007
- kfree(vcpu->arch.mce_banks);
9008
-fail_free_lapic:
9009
- kvm_free_lapic(vcpu);
9010
-fail_mmu_destroy:
9011
- kvm_mmu_destroy(vcpu);
9012
-fail_free_pio_data:
9013
- free_page((unsigned long)vcpu->arch.pio_data);
9014
-fail:
9015
- return r;
9016
-}
9017
-
9018
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
9019
-{
9020
- int idx;
9021
-
9022
- kvm_hv_vcpu_uninit(vcpu);
9023
- kvm_pmu_destroy(vcpu);
9024
- kfree(vcpu->arch.mce_banks);
9025
- kvm_free_lapic(vcpu);
9026
- idx = srcu_read_lock(&vcpu->kvm->srcu);
9027
- kvm_mmu_destroy(vcpu);
9028
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
9029
- free_page((unsigned long)vcpu->arch.pio_data);
9030
- if (!lapic_in_kernel(vcpu))
9031
- static_key_slow_dec(&kvm_no_apic_vcpu);
9032
-}
9033
-
903410598 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
903510599 {
10600
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10601
+
903610602 vcpu->arch.l1tf_flush_l1d = true;
9037
- kvm_x86_ops->sched_in(vcpu, cpu);
10603
+ if (pmu->version && unlikely(pmu->event_count)) {
10604
+ pmu->need_cleanup = true;
10605
+ kvm_make_request(KVM_REQ_PMU, vcpu);
10606
+ }
10607
+ kvm_x86_ops.sched_in(vcpu, cpu);
903810608 }
10609
+
10610
+void kvm_arch_free_vm(struct kvm *kvm)
10611
+{
10612
+ kfree(kvm->arch.hyperv.hv_pa_pg);
10613
+ vfree(kvm);
10614
+}
10615
+
903910616
904010617 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
904110618 {
10619
+ int ret;
10620
+
904210621 if (type)
904310622 return -EINVAL;
10623
+
10624
+ ret = kvm_page_track_init(kvm);
10625
+ if (ret)
10626
+ return ret;
904410627
904510628 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
904610629 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
....@@ -9059,7 +10642,7 @@
905910642 mutex_init(&kvm->arch.apic_map_lock);
906010643 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
906110644
9062
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
10645
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
906310646 pvclock_update_vm_gtod_copy(kvm);
906410647
906510648 kvm->arch.guest_can_read_msr_platform_info = true;
....@@ -9068,13 +10651,9 @@
906810651 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
906910652
907010653 kvm_hv_init_vm(kvm);
9071
- kvm_page_track_init(kvm);
907210654 kvm_mmu_init_vm(kvm);
907310655
9074
- if (kvm_x86_ops->vm_init)
9075
- return kvm_x86_ops->vm_init(kvm);
9076
-
9077
- return 0;
10656
+ return kvm_x86_ops.vm_init(kvm);
907810657 }
907910658
908010659 int kvm_arch_post_init_vm(struct kvm *kvm)
....@@ -9102,7 +10681,7 @@
910210681 kvm_unload_vcpu_mmu(vcpu);
910310682 }
910410683 kvm_for_each_vcpu(i, vcpu, kvm)
9105
- kvm_arch_vcpu_free(vcpu);
10684
+ kvm_vcpu_destroy(vcpu);
910610685
910710686 mutex_lock(&kvm->lock);
910810687 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
....@@ -9122,9 +10701,9 @@
912210701 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
912310702 {
912410703 int i, r;
9125
- unsigned long hva;
10704
+ unsigned long hva, old_npages;
912610705 struct kvm_memslots *slots = kvm_memslots(kvm);
9127
- struct kvm_memory_slot *slot, old;
10706
+ struct kvm_memory_slot *slot;
912810707
912910708 /* Called with kvm->slots_lock held. */
913010709 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
....@@ -9132,7 +10711,7 @@
913210711
913310712 slot = id_to_memslot(slots, id);
913410713 if (size) {
9135
- if (slot->npages)
10714
+ if (slot && slot->npages)
913610715 return -EEXIST;
913710716
913810717 /*
....@@ -9144,13 +10723,13 @@
914410723 if (IS_ERR((void *)hva))
914510724 return PTR_ERR((void *)hva);
914610725 } else {
9147
- if (!slot->npages)
10726
+ if (!slot || !slot->npages)
914810727 return 0;
914910728
10729
+ old_npages = slot->npages;
915010730 hva = 0;
915110731 }
915210732
9153
- old = *slot;
915410733 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
915510734 struct kvm_userspace_memory_region m;
915610735
....@@ -9165,23 +10744,11 @@
916510744 }
916610745
916710746 if (!size)
9168
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
10747
+ vm_munmap(hva, old_npages * PAGE_SIZE);
916910748
917010749 return 0;
917110750 }
917210751 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
9173
-
9174
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9175
-{
9176
- int r;
9177
-
9178
- mutex_lock(&kvm->slots_lock);
9179
- r = __x86_set_memory_region(kvm, id, gpa, size);
9180
- mutex_unlock(&kvm->slots_lock);
9181
-
9182
- return r;
9183
-}
9184
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
918510752
918610753 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
918710754 {
....@@ -9196,46 +10763,47 @@
919610763 	 * unless the memory map has changed due to process exit
919710764 * or fd copying.
919810765 */
9199
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
9200
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
9201
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10766
+ mutex_lock(&kvm->slots_lock);
10767
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10768
+ 0, 0);
10769
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10770
+ 0, 0);
10771
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10772
+ mutex_unlock(&kvm->slots_lock);
920210773 }
9203
- if (kvm_x86_ops->vm_destroy)
9204
- kvm_x86_ops->vm_destroy(kvm);
10774
+ if (kvm_x86_ops.vm_destroy)
10775
+ kvm_x86_ops.vm_destroy(kvm);
10776
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
920510777 kvm_pic_destroy(kvm);
920610778 kvm_ioapic_destroy(kvm);
920710779 kvm_free_vcpus(kvm);
920810780 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10781
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
920910782 kvm_mmu_uninit_vm(kvm);
921010783 kvm_page_track_cleanup(kvm);
921110784 kvm_hv_destroy_vm(kvm);
921210785 }
921310786
9214
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
9215
- struct kvm_memory_slot *dont)
10787
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
921610788 {
921710789 int i;
921810790
921910791 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
9220
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
9221
- kvfree(free->arch.rmap[i]);
9222
- free->arch.rmap[i] = NULL;
9223
- }
10792
+ kvfree(slot->arch.rmap[i]);
10793
+ slot->arch.rmap[i] = NULL;
10794
+
922410795 if (i == 0)
922510796 continue;
922610797
9227
- if (!dont || free->arch.lpage_info[i - 1] !=
9228
- dont->arch.lpage_info[i - 1]) {
9229
- kvfree(free->arch.lpage_info[i - 1]);
9230
- free->arch.lpage_info[i - 1] = NULL;
9231
- }
10798
+ kvfree(slot->arch.lpage_info[i - 1]);
10799
+ slot->arch.lpage_info[i - 1] = NULL;
923210800 }
923310801
9234
- kvm_page_track_free_memslot(free, dont);
10802
+ kvm_page_track_free_memslot(slot);
923510803 }
923610804
9237
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
9238
- unsigned long npages)
10805
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10806
+ unsigned long npages)
923910807 {
924010808 int i;
924110809
....@@ -9257,13 +10825,13 @@
925710825
925810826 slot->arch.rmap[i] =
925910827 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
9260
- GFP_KERNEL);
10828
+ GFP_KERNEL_ACCOUNT);
926110829 if (!slot->arch.rmap[i])
926210830 goto out_free;
926310831 if (i == 0)
926410832 continue;
926510833
9266
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
10834
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
926710835 if (!linfo)
926810836 goto out_free;
926910837
....@@ -9276,11 +10844,9 @@
927610844 ugfn = slot->userspace_addr >> PAGE_SHIFT;
927710845 /*
927810846 * If the gfn and userspace address are not aligned wrt each
9279
- * other, or if explicitly asked to, disable large page
9280
- * support for this slot
10847
+ * other, disable large page support for this slot.
928110848 */
9282
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
9283
- !kvm_largepages_enabled()) {
10849
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
928410850 unsigned long j;
928510851
928610852 for (j = 0; j < lpages; ++j)
....@@ -9327,76 +10893,23 @@
932710893 const struct kvm_userspace_memory_region *mem,
932810894 enum kvm_mr_change change)
932910895 {
9330
- if (change == KVM_MR_MOVE)
9331
- return kvm_arch_create_memslot(kvm, memslot,
9332
- mem->memory_size >> PAGE_SHIFT);
9333
-
10896
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10897
+ return kvm_alloc_memslot_metadata(memslot,
10898
+ mem->memory_size >> PAGE_SHIFT);
933410899 return 0;
933510900 }
933610901
933710902 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
9338
- struct kvm_memory_slot *new)
10903
+ struct kvm_memory_slot *old,
10904
+ struct kvm_memory_slot *new,
10905
+ enum kvm_mr_change change)
933910906 {
9340
- /* Still write protect RO slot */
9341
- if (new->flags & KVM_MEM_READONLY) {
9342
- kvm_mmu_slot_remove_write_access(kvm, new);
9343
- return;
9344
- }
9345
-
934610907 /*
9347
- * Call kvm_x86_ops dirty logging hooks when they are valid.
9348
- *
9349
- * kvm_x86_ops->slot_disable_log_dirty is called when:
9350
- *
9351
- * - KVM_MR_CREATE with dirty logging is disabled
9352
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
9353
- *
9354
- * The reason is, in case of PML, we need to set D-bit for any slots
9355
- * with dirty logging disabled in order to eliminate unnecessary GPA
9356
- * logging in PML buffer (and potential PML buffer full VMEXT). This
9357
- * guarantees leaving PML enabled during guest's lifetime won't have
9358
- * any additonal overhead from PML when guest is running with dirty
9359
- * logging disabled for memory slots.
9360
- *
9361
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
9362
- * to dirty logging mode.
9363
- *
9364
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
9365
- *
9366
- * In case of write protect:
9367
- *
9368
- * Write protect all pages for dirty logging.
9369
- *
9370
- * All the sptes including the large sptes which point to this
9371
- * slot are set to readonly. We can not create any new large
9372
- * spte on this slot until the end of the logging.
9373
- *
9374
- * See the comments in fast_page_fault().
10908
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10909
+ * See comments below.
937510910 */
9376
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
9377
- if (kvm_x86_ops->slot_enable_log_dirty)
9378
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
9379
- else
9380
- kvm_mmu_slot_remove_write_access(kvm, new);
9381
- } else {
9382
- if (kvm_x86_ops->slot_disable_log_dirty)
9383
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
9384
- }
9385
-}
9386
-
9387
-void kvm_arch_commit_memory_region(struct kvm *kvm,
9388
- const struct kvm_userspace_memory_region *mem,
9389
- const struct kvm_memory_slot *old,
9390
- const struct kvm_memory_slot *new,
9391
- enum kvm_mr_change change)
9392
-{
9393
- int nr_mmu_pages = 0;
9394
-
9395
- if (!kvm->arch.n_requested_mmu_pages)
9396
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
9397
-
9398
- if (nr_mmu_pages)
9399
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
10911
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10912
+ return;
940010913
940110914 /*
940210915 * Dirty logging tracks sptes in 4k granularity, meaning that large
....@@ -9409,29 +10922,91 @@
940910922 * Scan sptes if dirty logging has been stopped, dropping those
941010923 * which can be collapsed into a single large-page spte. Later
941110924 * page faults will create the large-page sptes.
10925
+ *
10926
+ * There is no need to do this in any of the following cases:
10927
+ * CREATE: No dirty mappings will already exist.
10928
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
10929
+ * kvm_arch_flush_shadow_memslot()
941210930 */
9413
- if ((change != KVM_MR_DELETE) &&
9414
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
9415
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10931
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10932
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
941610933 kvm_mmu_zap_collapsible_sptes(kvm, new);
941710934
941810935 /*
9419
- * Set up write protection and/or dirty logging for the new slot.
10936
+ * Enable or disable dirty logging for the slot.
942010937 *
9421
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
9422
- * been zapped so no dirty logging staff is needed for old slot. For
9423
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
9424
- * new and it's also covered when dealing with the new slot.
10938
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10939
+ * slot have been zapped so no dirty logging updates are needed for
10940
+ * the old slot.
10941
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10942
+ * any mappings that might be created in it will consume the
10943
+ * properties of the new slot and do not need to be updated here.
942510944 *
10945
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10946
+ * called to enable/disable dirty logging.
10947
+ *
10948
+ * When disabling dirty logging with PML enabled, the D-bit is set
10949
+ * for sptes in the slot in order to prevent unnecessary GPA
10950
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
10951
+ * This guarantees leaving PML enabled for the guest's lifetime
10952
+ * won't have any additional overhead from PML when the guest is
10953
+ * running with dirty logging disabled.
10954
+ *
10955
+ * When enabling dirty logging, large sptes are write-protected
10956
+ * so they can be split on first write. New large sptes cannot
10957
+ * be created for this slot until the end of the logging.
10958
+ * See the comments in fast_page_fault().
10959
+ * For small sptes, nothing is done if the dirty log is in the
10960
+ * initial-all-set state. Otherwise, depending on whether pml
10961
+ * is enabled the D-bit or the W-bit will be cleared.
10962
+ */
10963
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10964
+ if (kvm_x86_ops.slot_enable_log_dirty) {
10965
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10966
+ } else {
10967
+ int level =
10968
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10969
+ PG_LEVEL_2M : PG_LEVEL_4K;
10970
+
10971
+ /*
10972
+ * If we're with initial-all-set, we don't need
10973
+ * to write protect any small page because
10974
+ * they're reported as dirty already. However
10975
+ * we still need to write-protect huge pages
10976
+ * so that the page split can happen lazily on
10977
+ * the first write to the huge page.
10978
+ */
10979
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
10980
+ }
10981
+ } else {
10982
+ if (kvm_x86_ops.slot_disable_log_dirty)
10983
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10984
+ }
10985
+}
10986
+
10987
+void kvm_arch_commit_memory_region(struct kvm *kvm,
10988
+ const struct kvm_userspace_memory_region *mem,
10989
+ struct kvm_memory_slot *old,
10990
+ const struct kvm_memory_slot *new,
10991
+ enum kvm_mr_change change)
10992
+{
10993
+ if (!kvm->arch.n_requested_mmu_pages)
10994
+ kvm_mmu_change_mmu_pages(kvm,
10995
+ kvm_mmu_calculate_default_mmu_pages(kvm));
10996
+
10997
+ /*
942610998 * FIXME: const-ify all uses of struct kvm_memory_slot.
942710999 */
9428
- if (change != KVM_MR_DELETE)
9429
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
11000
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
11001
+
11002
+ /* Free the arrays associated with the old memslot. */
11003
+ if (change == KVM_MR_MOVE)
11004
+ kvm_arch_free_memslot(kvm, old);
943011005 }
943111006
943211007 void kvm_arch_flush_shadow_all(struct kvm *kvm)
943311008 {
9434
- kvm_mmu_invalidate_zap_all_pages(kvm);
11009
+ kvm_mmu_zap_all(kvm);
943511010 }
943611011
943711012 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
....@@ -9443,8 +11018,8 @@
944311018 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
944411019 {
944511020 return (is_guest_mode(vcpu) &&
9446
- kvm_x86_ops->guest_apic_has_interrupt &&
9447
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
11021
+ kvm_x86_ops.guest_apic_has_interrupt &&
11022
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
944811023 }
944911024
945011025 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
....@@ -9463,11 +11038,12 @@
946311038
946411039 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
946511040 (vcpu->arch.nmi_pending &&
9466
- kvm_x86_ops->nmi_allowed(vcpu)))
11041
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
946711042 return true;
946811043
946911044 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
9470
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
11045
+ (vcpu->arch.smi_pending &&
11046
+ kvm_x86_ops.smi_allowed(vcpu, false)))
947111047 return true;
947211048
947311049 if (kvm_arch_interrupt_allowed(vcpu) &&
....@@ -9476,6 +11052,11 @@
947611052 return true;
947711053
947811054 if (kvm_hv_has_stimer_pending(vcpu))
11055
+ return true;
11056
+
11057
+ if (is_guest_mode(vcpu) &&
11058
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
11059
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
947911060 return true;
948011061
948111062 return false;
....@@ -9496,7 +11077,7 @@
949611077 kvm_test_request(KVM_REQ_EVENT, vcpu))
949711078 return true;
949811079
9499
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
11080
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
950011081 return true;
950111082
950211083 return false;
....@@ -9514,7 +11095,7 @@
951411095
951511096 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
951611097 {
9517
- return kvm_x86_ops->interrupt_allowed(vcpu);
11098
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
951811099 }
951911100
952011101 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
....@@ -9536,7 +11117,7 @@
953611117 {
953711118 unsigned long rflags;
953811119
9539
- rflags = kvm_x86_ops->get_rflags(vcpu);
11120
+ rflags = kvm_x86_ops.get_rflags(vcpu);
954011121 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
954111122 rflags &= ~X86_EFLAGS_TF;
954211123 return rflags;
....@@ -9548,7 +11129,7 @@
954811129 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
954911130 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
955011131 rflags |= X86_EFLAGS_TF;
9551
- kvm_x86_ops->set_rflags(vcpu, rflags);
11132
+ kvm_x86_ops.set_rflags(vcpu, rflags);
955211133 }
955311134
955411135 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
....@@ -9562,7 +11143,7 @@
956211143 {
956311144 int r;
956411145
9565
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
11146
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
956611147 work->wakeup_all)
956711148 return;
956811149
....@@ -9570,21 +11151,23 @@
957011151 if (unlikely(r))
957111152 return;
957211153
9573
- if (!vcpu->arch.mmu.direct_map &&
9574
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
11154
+ if (!vcpu->arch.mmu->direct_map &&
11155
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
957511156 return;
957611157
9577
- vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
11158
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
957811159 }
957911160
958011161 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
958111162 {
11163
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
11164
+
958211165 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
958311166 }
958411167
958511168 static inline u32 kvm_async_pf_next_probe(u32 key)
958611169 {
9587
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
11170
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
958811171 }
958911172
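/*
 * Editor's note -- illustrative sketch, not part of this patch. It models
 * the open-addressed gfn table used by the async-PF helpers here: free
 * slots hold ~0, and the probe wraps with "& (size - 1)", which is a valid
 * modulo only when the table size is a power of two -- the reason for the
 * new BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)). EXAMPLE_TABLE_SIZE
 * and the example_* helpers are made up for illustration (kernel u32/u64
 * types assumed).
 */
#define EXAMPLE_TABLE_SIZE 64		/* stand-in for ASYNC_PF_PER_VCPU */

static u64 example_gfns[EXAMPLE_TABLE_SIZE] = {
	[0 ... EXAMPLE_TABLE_SIZE - 1] = ~0ULL	/* all slots start free */
};

static u32 example_next_probe(u32 key)
{
	return (key + 1) & (EXAMPLE_TABLE_SIZE - 1);	/* power-of-two wrap */
}

static void example_add_gfn(u32 key, u64 gfn)
{
	/* Linear probing: walk forward until a free (~0) slot is found. */
	while (example_gfns[key] != ~0ULL)
		key = example_next_probe(key);
	example_gfns[key] = gfn;
}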
959011173 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
....@@ -9602,7 +11185,7 @@
960211185 int i;
960311186 u32 key = kvm_async_pf_hash_fn(gfn);
960411187
9605
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
11188
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
960611189 (vcpu->arch.apf.gfns[key] != gfn &&
960711190 vcpu->arch.apf.gfns[key] != ~0); i++)
960811191 key = kvm_async_pf_next_probe(key);
....@@ -9620,6 +11203,10 @@
962011203 u32 i, j, k;
962111204
962211205 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
11206
+
11207
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
11208
+ return;
11209
+
962311210 while (true) {
962411211 vcpu->arch.apf.gfns[i] = ~0;
962511212 do {
....@@ -9638,21 +11225,64 @@
963811225 }
963911226 }
964011227
9641
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
11228
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
964211229 {
11230
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
964311231
9644
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
9645
- sizeof(val));
11232
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
11233
+ sizeof(reason));
964611234 }
964711235
9648
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
11236
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
964911237 {
11238
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
965011239
9651
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
9652
- sizeof(u32));
11240
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11241
+ &token, offset, sizeof(token));
965311242 }
965411243
9655
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
11244
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
11245
+{
11246
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11247
+ u32 val;
11248
+
11249
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11250
+ &val, offset, sizeof(val)))
11251
+ return false;
11252
+
11253
+ return !val;
11254
+}
11255
+
11256
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
11257
+{
11258
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
11259
+ return false;
11260
+
11261
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
11262
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
11263
+ return false;
11264
+
11265
+ return true;
11266
+}
11267
+
11268
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
11269
+{
11270
+ if (unlikely(!lapic_in_kernel(vcpu) ||
11271
+ kvm_event_needs_reinjection(vcpu) ||
11272
+ vcpu->arch.exception.pending))
11273
+ return false;
11274
+
11275
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
11276
+ return false;
11277
+
11278
+ /*
11279
+ * If interrupts are off we cannot even use an artificial
11280
+ * halt state.
11281
+ */
11282
+ return kvm_arch_interrupt_allowed(vcpu);
11283
+}
11284
+
11285
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
965611286 struct kvm_async_pf *work)
965711287 {
965811288 struct x86_exception fault;
....@@ -9660,11 +11290,8 @@
966011290 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
966111291 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
966211292
9663
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
9664
- (vcpu->arch.apf.send_user_only &&
9665
- kvm_x86_ops->get_cpl(vcpu) == 0))
9666
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
9667
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
11293
+ if (kvm_can_deliver_async_pf(vcpu) &&
11294
+ !apf_put_user_notpresent(vcpu)) {
966811295 fault.vector = PF_VECTOR;
966911296 fault.error_code_valid = true;
967011297 fault.error_code = 0;
....@@ -9672,14 +11299,28 @@
967211299 fault.address = work->arch.token;
967311300 fault.async_page_fault = true;
967411301 kvm_inject_page_fault(vcpu, &fault);
11302
+ return true;
11303
+ } else {
11304
+ /*
11305
+ * It is not possible to deliver a paravirtualized asynchronous
11306
+ * page fault, but putting the guest in an artificial halt state
11307
+ * can be beneficial nevertheless: if an interrupt arrives, we
11308
+ * can deliver it timely and perhaps the guest will schedule
11309
+ * another process. When the instruction that triggered a page
11310
+ * fault is retried, hopefully the page will be ready in the host.
11311
+ */
11312
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
11313
+ return false;
967511314 }
967611315 }
967711316
967811317 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
967911318 struct kvm_async_pf *work)
968011319 {
9681
- struct x86_exception fault;
9682
- u32 val;
11320
+ struct kvm_lapic_irq irq = {
11321
+ .delivery_mode = APIC_DM_FIXED,
11322
+ .vector = vcpu->arch.apf.vec
11323
+ };
968311324
968411325 if (work->wakeup_all)
968511326 work->arch.token = ~0; /* broadcast wakeup */
....@@ -9687,37 +11328,30 @@
968711328 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
968811329 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
968911330
9690
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
9691
- !apf_get_user(vcpu, &val)) {
9692
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
9693
- vcpu->arch.exception.pending &&
9694
- vcpu->arch.exception.nr == PF_VECTOR &&
9695
- !apf_put_user(vcpu, 0)) {
9696
- vcpu->arch.exception.injected = false;
9697
- vcpu->arch.exception.pending = false;
9698
- vcpu->arch.exception.nr = 0;
9699
- vcpu->arch.exception.has_error_code = false;
9700
- vcpu->arch.exception.error_code = 0;
9701
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
9702
- fault.vector = PF_VECTOR;
9703
- fault.error_code_valid = true;
9704
- fault.error_code = 0;
9705
- fault.nested_page_fault = false;
9706
- fault.address = work->arch.token;
9707
- fault.async_page_fault = true;
9708
- kvm_inject_page_fault(vcpu, &fault);
9709
- }
11331
+ if ((work->wakeup_all || work->notpresent_injected) &&
11332
+ kvm_pv_async_pf_enabled(vcpu) &&
11333
+ !apf_put_user_ready(vcpu, work->arch.token)) {
11334
+ vcpu->arch.apf.pageready_pending = true;
11335
+ kvm_apic_set_irq(vcpu, &irq, NULL);
971011336 }
11337
+
971111338 vcpu->arch.apf.halted = false;
971211339 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
971311340 }
971411341
9715
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
11342
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
971611343 {
9717
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
11344
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
11345
+ if (!vcpu->arch.apf.pageready_pending)
11346
+ kvm_vcpu_kick(vcpu);
11347
+}
11348
+
11349
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11350
+{
11351
+ if (!kvm_pv_async_pf_enabled(vcpu))
971811352 return true;
971911353 else
9720
- return kvm_can_do_async_pf(vcpu);
11354
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
972111355 }
972211356
972311357 void kvm_arch_start_assignment(struct kvm *kvm)
....@@ -9732,9 +11366,9 @@
973211366 }
973311367 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
973411368
9735
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
11369
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
973611370 {
9737
- return atomic_read(&kvm->arch.assigned_device_count);
11371
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
973811372 }
973911373 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
974011374
....@@ -9758,7 +11392,7 @@
975811392
975911393 bool kvm_arch_has_irq_bypass(void)
976011394 {
9761
- return kvm_x86_ops->update_pi_irte != NULL;
11395
+ return true;
976211396 }
976311397
976411398 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
....@@ -9766,11 +11400,17 @@
976611400 {
976711401 struct kvm_kernel_irqfd *irqfd =
976811402 container_of(cons, struct kvm_kernel_irqfd, consumer);
11403
+ int ret;
976911404
977011405 irqfd->producer = prod;
11406
+ kvm_arch_start_assignment(irqfd->kvm);
11407
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11408
+ prod->irq, irqfd->gsi, 1);
977111409
9772
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
9773
- prod->irq, irqfd->gsi, 1);
11410
+ if (ret)
11411
+ kvm_arch_end_assignment(irqfd->kvm);
11412
+
11413
+ return ret;
977411414 }
977511415
977611416 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
....@@ -9789,26 +11429,185 @@
978911429 * when the irq is masked/disabled or the consumer side (KVM
979011430 	 * in this case) doesn't want to receive the interrupts.
979111431 */
9792
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11432
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
979311433 if (ret)
979411434 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
979511435 " fails: %d\n", irqfd->consumer.token, ret);
11436
+
11437
+ kvm_arch_end_assignment(irqfd->kvm);
979611438 }
979711439
979811440 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
979911441 uint32_t guest_irq, bool set)
980011442 {
9801
- if (!kvm_x86_ops->update_pi_irte)
9802
- return -EINVAL;
9803
-
9804
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
11443
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
980511444 }
980611445
980711446 bool kvm_vector_hashing_enabled(void)
980811447 {
980911448 return vector_hashing;
981011449 }
9811
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
11450
+
11451
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11452
+{
11453
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11454
+}
11455
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11456
+
11457
+
11458
+int kvm_spec_ctrl_test_value(u64 value)
11459
+{
11460
+ /*
11461
+ * test that setting IA32_SPEC_CTRL to given value
11462
+ * is allowed by the host processor
11463
+ */
11464
+
11465
+ u64 saved_value;
11466
+ unsigned long flags;
11467
+ int ret = 0;
11468
+
11469
+ local_irq_save(flags);
11470
+
11471
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11472
+ ret = 1;
11473
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11474
+ ret = 1;
11475
+ else
11476
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11477
+
11478
+ local_irq_restore(flags);
11479
+
11480
+ return ret;
11481
+}
11482
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
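/*
 * Editor's note -- illustrative sketch, not part of this patch. It shows
 * the intended use of kvm_spec_ctrl_test_value(): reject a guest WRMSR to
 * IA32_SPEC_CTRL whose bits the host CPU itself refuses to accept.
 * example_set_spec_ctrl() and the caller-provided shadow pointer are
 * hypothetical; the real callers live in the vendor modules.
 */
static int example_set_spec_ctrl(struct kvm_vcpu *vcpu, u64 data, u64 *shadow)
{
	if (kvm_spec_ctrl_test_value(data))
		return 1;	/* caller injects #GP into the guest */

	*shadow = data;		/* vendor code keeps its own shadow copy */
	return 0;
}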
11483
+
11484
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11485
+{
11486
+ struct x86_exception fault;
11487
+ u32 access = error_code &
11488
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11489
+
11490
+ if (!(error_code & PFERR_PRESENT_MASK) ||
11491
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11492
+ /*
11493
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11494
+ * tables probably do not match the TLB. Just proceed
11495
+ * with the error code that the processor gave.
11496
+ */
11497
+ fault.vector = PF_VECTOR;
11498
+ fault.error_code_valid = true;
11499
+ fault.error_code = error_code;
11500
+ fault.nested_page_fault = false;
11501
+ fault.address = gva;
11502
+ }
11503
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11504
+}
11505
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11506
+
11507
+/*
11508
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11509
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11510
+ * indicates whether exit to userspace is needed.
11511
+ */
11512
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11513
+ struct x86_exception *e)
11514
+{
11515
+ if (r == X86EMUL_PROPAGATE_FAULT) {
11516
+ kvm_inject_emulated_page_fault(vcpu, e);
11517
+ return 1;
11518
+ }
11519
+
11520
+ /*
11521
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11522
+ * while handling a VMX instruction KVM could've handled the request
11523
+ * correctly by exiting to userspace and performing I/O but there
11524
+ * doesn't seem to be a real use-case behind such requests, just return
11525
+ * KVM_EXIT_INTERNAL_ERROR for now.
11526
+ */
11527
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11528
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11529
+ vcpu->run->internal.ndata = 0;
11530
+
11531
+ return 0;
11532
+}
11533
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
11534
+
11535
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11536
+{
11537
+ bool pcid_enabled;
11538
+ struct x86_exception e;
11539
+ unsigned i;
11540
+ unsigned long roots_to_free = 0;
11541
+ struct {
11542
+ u64 pcid;
11543
+ u64 gla;
11544
+ } operand;
11545
+ int r;
11546
+
11547
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11548
+ if (r != X86EMUL_CONTINUE)
11549
+ return kvm_handle_memory_failure(vcpu, r, &e);
11550
+
11551
+ if (operand.pcid >> 12 != 0) {
11552
+ kvm_inject_gp(vcpu, 0);
11553
+ return 1;
11554
+ }
11555
+
11556
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11557
+
11558
+ switch (type) {
11559
+ case INVPCID_TYPE_INDIV_ADDR:
11560
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
11561
+ is_noncanonical_address(operand.gla, vcpu)) {
11562
+ kvm_inject_gp(vcpu, 0);
11563
+ return 1;
11564
+ }
11565
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11566
+ return kvm_skip_emulated_instruction(vcpu);
11567
+
11568
+ case INVPCID_TYPE_SINGLE_CTXT:
11569
+ if (!pcid_enabled && (operand.pcid != 0)) {
11570
+ kvm_inject_gp(vcpu, 0);
11571
+ return 1;
11572
+ }
11573
+
11574
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11575
+ kvm_mmu_sync_roots(vcpu);
11576
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11577
+ }
11578
+
11579
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11580
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11581
+ == operand.pcid)
11582
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11583
+
11584
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11585
+ /*
11586
+ * If neither the current cr3 nor any of the prev_roots use the
11587
+ * given PCID, then nothing needs to be done here because a
11588
+ * resync will happen anyway before switching to any other CR3.
11589
+ */
11590
+
11591
+ return kvm_skip_emulated_instruction(vcpu);
11592
+
11593
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
11594
+ /*
11595
+ * Currently, KVM doesn't mark global entries in the shadow
11596
+ * page tables, so a non-global flush just degenerates to a
11597
+ * global flush. If needed, we could optimize this later by
11598
+ * keeping track of global entries in shadow page tables.
11599
+ */
11600
+
11601
+ fallthrough;
11602
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
11603
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
11604
+ return kvm_skip_emulated_instruction(vcpu);
11605
+
11606
+ default:
11607
+ BUG(); /* We have already checked above that type <= 3 */
11608
+ }
11609
+}
11610
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
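/*
 * Editor's note -- illustrative sketch, not part of this patch. Vendor
 * exit handlers are expected to decode the INVPCID type and the memory
 * operand from their own exit information and then defer to the common
 * kvm_handle_invpcid() above. The decode is omitted here; note that an
 * out-of-range type must be rejected before calling the helper, which
 * BUG()s on type > 3.
 */
static int example_handle_invpcid_exit(struct kvm_vcpu *vcpu,
					unsigned long type, gva_t gva)
{
	if (type > 3) {
		kvm_inject_gp(vcpu, 0);	/* reserved type => #GP, stay in guest */
		return 1;
	}

	return kvm_handle_invpcid(vcpu, type, gva);
}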
981211611
981311612 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
981411613 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
....@@ -9820,12 +11619,31 @@
982011619 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
982111620 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
982211621 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11622
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
982311623 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
982411624 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
982511625 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
982611626 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
9827
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
11627
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
982811628 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
982911629 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
983011630 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
983111631 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11632
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11633
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11634
+
11635
+static int __init kvm_x86_init(void)
11636
+{
11637
+ kvm_mmu_x86_module_init();
11638
+ return 0;
11639
+}
11640
+module_init(kvm_x86_init);
11641
+
11642
+static void __exit kvm_x86_exit(void)
11643
+{
11644
+ /*
11645
+ * If module_init() is implemented, module_exit() must also be
11646
+ * implemented to allow module unload.
11647
+ */
11648
+}
11649
+module_exit(kvm_x86_exit);