forked from ~ljy/RK356X_SDK_RELEASE

hc | 2023-12-08 | commit 01573e231f18eb2d99162747186f59511f56b64d
kernel/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -13,22 +14,21 @@
  * Yaniv Kamay <yaniv@qumranet.com>
  * Amit Shah <amit.shah@qumranet.com>
  * Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */
 
 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
 #include "hyperv.h"
+#include "lapic.h"
 
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -54,7 +54,9 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
 
 #include <trace/events/kvm.h>
 
@@ -69,6 +71,10 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <clocksource/hyperv_timer.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -79,7 +85,7 @@
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
 
 #define emul_to_vcpu(ctxt) \
-	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+	((struct kvm_vcpu *)(ctxt)->vcpu)
 
 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@
 
 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
 
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 				KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
 
@@ -108,7 +111,7 @@
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);
 
-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);
 
 static bool __read_mostly ignore_msrs = 0;
@@ -138,10 +141,14 @@
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
 
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
 
 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -153,85 +160,147 @@
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);
 
-#define KVM_NR_SHARED_MSRS 16
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
 
-struct kvm_shared_msrs_global {
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs_global {
 	int nr;
-	u32 msrs[KVM_NR_SHARED_MSRS];
+	u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
 	struct user_return_notifier urn;
 	bool registered;
-	struct kvm_shared_msr_values {
+	struct kvm_user_return_msr_values {
 		u64 host;
 		u64 curr;
-	} values[KVM_NR_SHARED_MSRS];
+	} values[KVM_MAX_NR_USER_RETURN_MSRS];
 };
 
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+				| XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "pf_fixed", VCPU_STAT(pf_fixed) },
-	{ "pf_guest", VCPU_STAT(pf_guest) },
-	{ "tlb_flush", VCPU_STAT(tlb_flush) },
-	{ "invlpg", VCPU_STAT(invlpg) },
-	{ "exits", VCPU_STAT(exits) },
-	{ "io_exits", VCPU_STAT(io_exits) },
-	{ "mmio_exits", VCPU_STAT(mmio_exits) },
-	{ "signal_exits", VCPU_STAT(signal_exits) },
-	{ "irq_window", VCPU_STAT(irq_window_exits) },
-	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
-	{ "halt_exits", VCPU_STAT(halt_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "irq_exits", VCPU_STAT(irq_exits) },
-	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "fpu_reload", VCPU_STAT(fpu_reload) },
-	{ "insn_emulation", VCPU_STAT(insn_emulation) },
-	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-	{ "irq_injections", VCPU_STAT(irq_injections) },
-	{ "nmi_injections", VCPU_STAT(nmi_injections) },
-	{ "req_event", VCPU_STAT(req_event) },
-	{ "l1d_flush", VCPU_STAT(l1d_flush) },
-	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-	{ "mmu_flooded", VM_STAT(mmu_flooded) },
-	{ "mmu_recycled", VM_STAT(mmu_recycled) },
-	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages, .mode = 0444) },
-	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-	{ "max_mmu_page_hash_collisions",
-		VM_STAT(max_mmu_page_hash_collisions) },
+	VCPU_STAT("pf_fixed", pf_fixed),
+	VCPU_STAT("pf_guest", pf_guest),
+	VCPU_STAT("tlb_flush", tlb_flush),
+	VCPU_STAT("invlpg", invlpg),
+	VCPU_STAT("exits", exits),
+	VCPU_STAT("io_exits", io_exits),
+	VCPU_STAT("mmio_exits", mmio_exits),
+	VCPU_STAT("signal_exits", signal_exits),
+	VCPU_STAT("irq_window", irq_window_exits),
+	VCPU_STAT("nmi_window", nmi_window_exits),
+	VCPU_STAT("halt_exits", halt_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hypercalls", hypercalls),
+	VCPU_STAT("request_irq", request_irq_exits),
+	VCPU_STAT("irq_exits", irq_exits),
+	VCPU_STAT("host_state_reload", host_state_reload),
+	VCPU_STAT("fpu_reload", fpu_reload),
+	VCPU_STAT("insn_emulation", insn_emulation),
+	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+	VCPU_STAT("irq_injections", irq_injections),
+	VCPU_STAT("nmi_injections", nmi_injections),
+	VCPU_STAT("req_event", req_event),
+	VCPU_STAT("l1d_flush", l1d_flush),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+	VM_STAT("mmu_pte_write", mmu_pte_write),
+	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+	VM_STAT("mmu_flooded", mmu_flooded),
+	VM_STAT("mmu_recycled", mmu_recycled),
+	VM_STAT("mmu_cache_miss", mmu_cache_miss),
+	VM_STAT("mmu_unsync", mmu_unsync),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("largepages", lpages, .mode = 0444),
+	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 	{ NULL }
 };
 
 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
+
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+				  u64 data, bool write)
+{
+	const char *op = write ? "wrmsr" : "rdmsr";
+
+	if (ignore_msrs) {
+		if (report_ignored_msrs)
+			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		/* Mask the error */
+		return true;
+	} else {
+		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		return false;
+	}
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+	unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+	return kmem_cache_create_usercopy("x86_emulator", size,
+					  __alignof__(struct x86_emulate_ctxt),
+					  SLAB_ACCOUNT, useroffset,
+					  size - useroffset, NULL);
+}
 
 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
 
 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 		vcpu->arch.apf.gfns[i] = ~0;
 }
 
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
-	struct kvm_shared_msrs *locals
-		= container_of(urn, struct kvm_shared_msrs, urn);
-	struct kvm_shared_msr_values *values;
+	struct kvm_user_return_msrs *msrs
+		= container_of(urn, struct kvm_user_return_msrs, urn);
+	struct kvm_user_return_msr_values *values;
 	unsigned long flags;
 
 	/*
@@ -239,84 +308,89 @@
 	 * interrupted and executed through kvm_arch_hardware_disable()
 	 */
 	local_irq_save(flags);
-	if (locals->registered) {
-		locals->registered = false;
+	if (msrs->registered) {
+		msrs->registered = false;
 		user_return_notifier_unregister(urn);
 	}
 	local_irq_restore(flags);
-	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
-		values = &locals->values[slot];
+	for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+		values = &msrs->values[slot];
 		if (values->host != values->curr) {
-			wrmsrl(shared_msrs_global.msrs[slot], values->host);
+			wrmsrl(user_return_msrs_global.msrs[slot], values->host);
 			values->curr = values->host;
 		}
 	}
 }
 
-static void shared_msr_update(unsigned slot, u32 msr)
+int kvm_probe_user_return_msr(u32 msr)
 {
+	u64 val;
+	int ret;
+
+	preempt_disable();
+	ret = rdmsrl_safe(msr, &val);
+	if (ret)
+		goto out;
+	ret = wrmsrl_safe(msr, val);
+out:
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
+
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
+{
+	BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+	user_return_msrs_global.msrs[slot] = msr;
+	if (slot >= user_return_msrs_global.nr)
+		user_return_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+	unsigned int cpu = smp_processor_id();
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 	u64 value;
-	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	int i;
 
-	/* only read, and nobody should modify it at this time,
-	 * so don't need lock */
-	if (slot >= shared_msrs_global.nr) {
-		printk(KERN_ERR "kvm: invalid MSR slot!");
-		return;
+	for (i = 0; i < user_return_msrs_global.nr; ++i) {
+		rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+		msrs->values[i].host = value;
+		msrs->values[i].curr = value;
 	}
-	rdmsrl_safe(msr, &value);
-	smsr->values[slot].host = value;
-	smsr->values[slot].curr = value;
 }
 
-void kvm_define_shared_msr(unsigned slot, u32 msr)
-{
-	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
-	shared_msrs_global.msrs[slot] = msr;
-	if (slot >= shared_msrs_global.nr)
-		shared_msrs_global.nr = slot + 1;
-}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
-
-static void kvm_shared_msr_cpu_online(void)
-{
-	unsigned i;
-
-	for (i = 0; i < shared_msrs_global.nr; ++i)
-		shared_msr_update(i, shared_msrs_global.msrs[i]);
-}
-
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
 {
 	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 	int err;
 
-	value = (value & mask) | (smsr->values[slot].host & ~mask);
-	if (value == smsr->values[slot].curr)
+	value = (value & mask) | (msrs->values[slot].host & ~mask);
+	if (value == msrs->values[slot].curr)
 		return 0;
-	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+	err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
 	if (err)
 		return 1;
 
-	smsr->values[slot].curr = value;
-	if (!smsr->registered) {
-		smsr->urn.on_user_return = kvm_on_user_return;
-		user_return_notifier_register(&smsr->urn);
-		smsr->registered = true;
+	msrs->values[slot].curr = value;
+	if (!msrs->registered) {
+		msrs->urn.on_user_return = kvm_on_user_return;
+		user_return_notifier_register(&msrs->urn);
+		msrs->registered = true;
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
 
 static void drop_user_return_notifiers(void)
 {
 	unsigned int cpu = smp_processor_id();
-	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
 
-	if (smsr->registered)
-		kvm_on_user_return(&smsr->urn);
+	if (msrs->registered)
+		kvm_on_user_return(&msrs->urn);
 }
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
....@@ -348,14 +422,15 @@
348422 }
349423
350424 kvm_lapic_set_base(vcpu, msr_info->data);
425
+ kvm_recalculate_apic_map(vcpu->kvm);
351426 return 0;
352427 }
353428 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
354429
355
-asmlinkage __visible void kvm_spurious_fault(void)
430
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
356431 {
357432 /* Fault while not rebooting. We want the trace. */
358
- BUG();
433
+ BUG_ON(!kvm_rebooting);
359434 }
360435 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
361436
....@@ -384,6 +459,7 @@
384459 #define EXCPT_TRAP 1
385460 #define EXCPT_ABORT 2
386461 #define EXCPT_INTERRUPT 3
462
+#define EXCPT_DB 4
387463
388464 static int exception_type(int vector)
389465 {
....@@ -394,8 +470,14 @@
394470
395471 mask = 1 << vector;
396472
397
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
398
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
473
+ /*
474
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
475
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
476
+ */
477
+ if (mask & (1 << DB_VECTOR))
478
+ return EXCPT_DB;
479
+
480
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
399481 return EXCPT_TRAP;
400482
401483 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
....@@ -405,9 +487,59 @@
405487 return EXCPT_FAULT;
406488 }
407489
490
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
491
+{
492
+ unsigned nr = vcpu->arch.exception.nr;
493
+ bool has_payload = vcpu->arch.exception.has_payload;
494
+ unsigned long payload = vcpu->arch.exception.payload;
495
+
496
+ if (!has_payload)
497
+ return;
498
+
499
+ switch (nr) {
500
+ case DB_VECTOR:
501
+ /*
502
+ * "Certain debug exceptions may clear bit 0-3. The
503
+ * remaining contents of the DR6 register are never
504
+ * cleared by the processor".
505
+ */
506
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
507
+ /*
508
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
509
+ */
510
+ vcpu->arch.dr6 |= DR6_RTM;
511
+ vcpu->arch.dr6 |= payload;
512
+ /*
513
+ * Bit 16 should be set in the payload whenever the #DB
514
+ * exception should clear DR6.RTM. This makes the payload
515
+ * compatible with the pending debug exceptions under VMX.
516
+ * Though not currently documented in the SDM, this also
517
+ * makes the payload compatible with the exit qualification
518
+ * for #DB exceptions under VMX.
519
+ */
520
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
521
+
522
+ /*
523
+ * The #DB payload is defined as compatible with the 'pending
524
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
525
+ * defined in the 'pending debug exceptions' field (enabled
526
+ * breakpoint), it is reserved and must be zero in DR6.
527
+ */
528
+ vcpu->arch.dr6 &= ~BIT(12);
529
+ break;
530
+ case PF_VECTOR:
531
+ vcpu->arch.cr2 = payload;
532
+ break;
533
+ }
534
+
535
+ vcpu->arch.exception.has_payload = false;
536
+ vcpu->arch.exception.payload = 0;
537
+}
538
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
539
+
408540 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
409541 unsigned nr, bool has_error, u32 error_code,
410
- bool reinject)
542
+ bool has_payload, unsigned long payload, bool reinject)
411543 {
412544 u32 prev_nr;
413545 int class1, class2;
....@@ -427,6 +559,14 @@
427559 */
428560 WARN_ON_ONCE(vcpu->arch.exception.pending);
429561 vcpu->arch.exception.injected = true;
562
+ if (WARN_ON_ONCE(has_payload)) {
563
+ /*
564
+ * A reinjected event has already
565
+ * delivered its payload.
566
+ */
567
+ has_payload = false;
568
+ payload = 0;
569
+ }
430570 } else {
431571 vcpu->arch.exception.pending = true;
432572 vcpu->arch.exception.injected = false;
....@@ -434,6 +574,10 @@
434574 vcpu->arch.exception.has_error_code = has_error;
435575 vcpu->arch.exception.nr = nr;
436576 vcpu->arch.exception.error_code = error_code;
577
+ vcpu->arch.exception.has_payload = has_payload;
578
+ vcpu->arch.exception.payload = payload;
579
+ if (!is_guest_mode(vcpu))
580
+ kvm_deliver_exception_payload(vcpu);
437581 return;
438582 }
439583
....@@ -458,6 +602,8 @@
458602 vcpu->arch.exception.has_error_code = true;
459603 vcpu->arch.exception.nr = DF_VECTOR;
460604 vcpu->arch.exception.error_code = 0;
605
+ vcpu->arch.exception.has_payload = false;
606
+ vcpu->arch.exception.payload = 0;
461607 } else
462608 /* replace previous exception with a new one in a hope
463609 that instruction re-execution will regenerate lost
....@@ -467,15 +613,29 @@
467613
468614 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
469615 {
470
- kvm_multiple_exception(vcpu, nr, false, 0, false);
616
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
471617 }
472618 EXPORT_SYMBOL_GPL(kvm_queue_exception);
473619
474620 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
475621 {
476
- kvm_multiple_exception(vcpu, nr, false, 0, true);
622
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
477623 }
478624 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
625
+
626
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
627
+ unsigned long payload)
628
+{
629
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
630
+}
631
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
632
+
633
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
634
+ u32 error_code, unsigned long payload)
635
+{
636
+ kvm_multiple_exception(vcpu, nr, true, error_code,
637
+ true, payload, false);
638
+}
479639
480640 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
481641 {
....@@ -493,23 +653,38 @@
493653 ++vcpu->stat.pf_guest;
494654 vcpu->arch.exception.nested_apf =
495655 is_guest_mode(vcpu) && fault->async_page_fault;
496
- if (vcpu->arch.exception.nested_apf)
656
+ if (vcpu->arch.exception.nested_apf) {
497657 vcpu->arch.apf.nested_apf_token = fault->address;
498
- else
499
- vcpu->arch.cr2 = fault->address;
500
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
658
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
659
+ } else {
660
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
661
+ fault->address);
662
+ }
501663 }
502664 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
503665
504
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
666
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
667
+ struct x86_exception *fault)
505668 {
506
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
507
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
508
- else
509
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
669
+ struct kvm_mmu *fault_mmu;
670
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
510671
672
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
673
+ vcpu->arch.walk_mmu;
674
+
675
+ /*
676
+ * Invalidate the TLB entry for the faulting address, if it exists,
677
+ * else the access will fault indefinitely (and to emulate hardware).
678
+ */
679
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
680
+ !(fault->error_code & PFERR_RSVD_MASK))
681
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
682
+ fault_mmu->root_hpa);
683
+
684
+ fault_mmu->inject_page_fault(vcpu, fault);
511685 return fault->nested_page_fault;
512686 }
687
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
513688
514689 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
515690 {
....@@ -520,13 +695,13 @@
520695
521696 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
522697 {
523
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
698
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
524699 }
525700 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
526701
527702 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
528703 {
529
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
704
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
530705 }
531706 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
532707
....@@ -536,7 +711,7 @@
536711 */
537712 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
538713 {
539
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
714
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
540715 return true;
541716 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
542717 return false;
....@@ -618,10 +793,8 @@
618793 ret = 1;
619794
620795 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
621
- __set_bit(VCPU_EXREG_PDPTR,
622
- (unsigned long *)&vcpu->arch.regs_avail);
623
- __set_bit(VCPU_EXREG_PDPTR,
624
- (unsigned long *)&vcpu->arch.regs_dirty);
796
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
797
+
625798 out:
626799
627800 return ret;
....@@ -631,7 +804,6 @@
631804 bool pdptrs_changed(struct kvm_vcpu *vcpu)
632805 {
633806 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
634
- bool changed = true;
635807 int offset;
636808 gfn_t gfn;
637809 int r;
....@@ -639,8 +811,7 @@
639811 if (!is_pae_paging(vcpu))
640812 return false;
641813
642
- if (!test_bit(VCPU_EXREG_PDPTR,
643
- (unsigned long *)&vcpu->arch.regs_avail))
814
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
644815 return true;
645816
646817 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
....@@ -648,17 +819,16 @@
648819 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
649820 PFERR_USER_MASK | PFERR_WRITE_MASK);
650821 if (r < 0)
651
- goto out;
652
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
653
-out:
822
+ return true;
654823
655
- return changed;
824
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
656825 }
657826 EXPORT_SYMBOL_GPL(pdptrs_changed);
658827
659828 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
660829 {
661830 unsigned long old_cr0 = kvm_read_cr0(vcpu);
831
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
662832 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
663833
664834 cr0 |= X86_CR0_ET;
....@@ -676,27 +846,27 @@
676846 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
677847 return 1;
678848
679
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
680849 #ifdef CONFIG_X86_64
681
- if ((vcpu->arch.efer & EFER_LME)) {
682
- int cs_db, cs_l;
850
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
851
+ (cr0 & X86_CR0_PG)) {
852
+ int cs_db, cs_l;
683853
684
- if (!is_pae(vcpu))
685
- return 1;
686
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
687
- if (cs_l)
688
- return 1;
689
- } else
690
-#endif
691
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
692
- kvm_read_cr3(vcpu)))
854
+ if (!is_pae(vcpu))
855
+ return 1;
856
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
857
+ if (cs_l)
693858 return 1;
694859 }
860
+#endif
861
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
862
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
863
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
864
+ return 1;
695865
696866 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
697867 return 1;
698868
699
- kvm_x86_ops->set_cr0(vcpu, cr0);
869
+ kvm_x86_ops.set_cr0(vcpu, cr0);
700870
701871 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
702872 kvm_clear_async_pf_completion_queue(vcpu);
....@@ -721,27 +891,48 @@
721891 }
722892 EXPORT_SYMBOL_GPL(kvm_lmsw);
723893
724
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
894
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
725895 {
726
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
727
- !vcpu->guest_xcr0_loaded) {
728
- /* kvm_set_xcr() also depends on this */
896
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
897
+
729898 if (vcpu->arch.xcr0 != host_xcr0)
730899 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
731
- vcpu->guest_xcr0_loaded = 1;
732
- }
733
-}
734
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
735900
736
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
901
+ if (vcpu->arch.xsaves_enabled &&
902
+ vcpu->arch.ia32_xss != host_xss)
903
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
904
+ }
905
+
906
+ if (static_cpu_has(X86_FEATURE_PKU) &&
907
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
908
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
909
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
910
+ __write_pkru(vcpu->arch.pkru);
911
+}
912
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
913
+
914
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
737915 {
738
- if (vcpu->guest_xcr0_loaded) {
916
+ if (static_cpu_has(X86_FEATURE_PKU) &&
917
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
918
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
919
+ vcpu->arch.pkru = rdpkru();
920
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
921
+ __write_pkru(vcpu->arch.host_pkru);
922
+ }
923
+
924
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
925
+
739926 if (vcpu->arch.xcr0 != host_xcr0)
740927 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
741
- vcpu->guest_xcr0_loaded = 0;
928
+
929
+ if (vcpu->arch.xsaves_enabled &&
930
+ vcpu->arch.ia32_xss != host_xss)
931
+ wrmsrl(MSR_IA32_XSS, host_xss);
742932 }
933
+
743934 }
744
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
935
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
745936
746937 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
747938 {
....@@ -779,13 +970,13 @@
779970 vcpu->arch.xcr0 = xcr0;
780971
781972 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
782
- kvm_update_cpuid(vcpu);
973
+ kvm_update_cpuid_runtime(vcpu);
783974 return 0;
784975 }
785976
786977 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
787978 {
788
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
979
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
789980 __kvm_set_xcr(vcpu, index, xcr)) {
790981 kvm_inject_gp(vcpu, 0);
791982 return 1;
....@@ -794,63 +985,20 @@
794985 }
795986 EXPORT_SYMBOL_GPL(kvm_set_xcr);
796987
797
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
798
-{
799
- u64 reserved_bits = CR4_RESERVED_BITS;
800
-
801
- if (!cpu_has(c, X86_FEATURE_XSAVE))
802
- reserved_bits |= X86_CR4_OSXSAVE;
803
-
804
- if (!cpu_has(c, X86_FEATURE_SMEP))
805
- reserved_bits |= X86_CR4_SMEP;
806
-
807
- if (!cpu_has(c, X86_FEATURE_SMAP))
808
- reserved_bits |= X86_CR4_SMAP;
809
-
810
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
811
- reserved_bits |= X86_CR4_FSGSBASE;
812
-
813
- if (!cpu_has(c, X86_FEATURE_PKU))
814
- reserved_bits |= X86_CR4_PKE;
815
-
816
- if (!cpu_has(c, X86_FEATURE_LA57) &&
817
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
818
- reserved_bits |= X86_CR4_LA57;
819
-
820
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
821
- reserved_bits |= X86_CR4_UMIP;
822
-
823
- return reserved_bits;
824
-}
825
-
826
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
988
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
827989 {
828990 if (cr4 & cr4_reserved_bits)
829991 return -EINVAL;
830992
831
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
993
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
832994 return -EINVAL;
833995
834
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
835
- return -EINVAL;
836
-
837
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
838
- return -EINVAL;
839
-
840
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
841
- return -EINVAL;
842
-
843
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
844
- return -EINVAL;
845
-
846
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
847
- return -EINVAL;
848
-
849
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
996
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
850997 return -EINVAL;
851998
852999 return 0;
8531000 }
1001
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
8541002
8551003 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
8561004 {
....@@ -882,15 +1030,14 @@
8821030 return 1;
8831031 }
8841032
885
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
886
- return 1;
1033
+ kvm_x86_ops.set_cr4(vcpu, cr4);
8871034
8881035 if (((cr4 ^ old_cr4) & mmu_role_bits) ||
8891036 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
8901037 kvm_mmu_reset_context(vcpu);
8911038
8921039 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
893
- kvm_update_cpuid(vcpu);
1040
+ kvm_update_cpuid_runtime(vcpu);
8941041
8951042 return 0;
8961043 }
....@@ -911,21 +1058,21 @@
9111058 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
9121059 if (!skip_tlb_flush) {
9131060 kvm_mmu_sync_roots(vcpu);
914
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1061
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9151062 }
9161063 return 0;
9171064 }
9181065
9191066 if (is_long_mode(vcpu) &&
920
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
1067
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
9211068 return 1;
9221069 else if (is_pae_paging(vcpu) &&
9231070 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
9241071 return 1;
9251072
926
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
1073
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
9271074 vcpu->arch.cr3 = cr3;
928
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1075
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9291076
9301077 return 0;
9311078 }
....@@ -963,13 +1110,7 @@
9631110 }
9641111 }
9651112
966
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
967
-{
968
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
969
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
970
-}
971
-
972
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
1113
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
9731114 {
9741115 unsigned long dr7;
9751116
....@@ -977,11 +1118,12 @@
9771118 dr7 = vcpu->arch.guest_debug_dr7;
9781119 else
9791120 dr7 = vcpu->arch.dr7;
980
- kvm_x86_ops->set_dr7(vcpu, dr7);
1121
+ kvm_x86_ops.set_dr7(vcpu, dr7);
9811122 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
9821123 if (dr7 & DR7_BP_EN_MASK)
9831124 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
9841125 }
1126
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
9851127
9861128 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
9871129 {
....@@ -1003,17 +1145,14 @@
10031145 vcpu->arch.eff_db[dr] = val;
10041146 break;
10051147 case 4:
1006
- /* fall through */
10071148 case 6:
1008
- if (val & 0xffffffff00000000ULL)
1149
+ if (!kvm_dr6_valid(val))
10091150 return -1; /* #GP */
10101151 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1011
- kvm_update_dr6(vcpu);
10121152 break;
10131153 case 5:
1014
- /* fall through */
10151154 default: /* 7 */
1016
- if (val & 0xffffffff00000000ULL)
1155
+ if (!kvm_dr7_valid(val))
10171156 return -1; /* #GP */
10181157 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
10191158 kvm_update_dr7(vcpu);
....@@ -1042,15 +1181,10 @@
10421181 *val = vcpu->arch.db[array_index_nospec(dr, size)];
10431182 break;
10441183 case 4:
1045
- /* fall through */
10461184 case 6:
1047
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1048
- *val = vcpu->arch.dr6;
1049
- else
1050
- *val = kvm_x86_ops->get_dr6(vcpu);
1185
+ *val = vcpu->arch.dr6;
10511186 break;
10521187 case 5:
1053
- /* fall through */
10541188 default: /* 7 */
10551189 *val = vcpu->arch.dr7;
10561190 break;
....@@ -1061,15 +1195,15 @@
10611195
10621196 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
10631197 {
1064
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1198
+ u32 ecx = kvm_rcx_read(vcpu);
10651199 u64 data;
10661200 int err;
10671201
10681202 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
10691203 if (err)
10701204 return err;
1071
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1072
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1205
+ kvm_rax_write(vcpu, (u32)data);
1206
+ kvm_rdx_write(vcpu, data >> 32);
10731207 return err;
10741208 }
10751209 EXPORT_SYMBOL_GPL(kvm_rdpmc);
....@@ -1078,26 +1212,66 @@
10781212 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
10791213 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
10801214 *
1081
- * This list is modified at module load time to reflect the
1215
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
1216
+ * extract the supported MSRs from the related const lists.
1217
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
10821218 * capabilities of the host cpu. This capabilities test skips MSRs that are
1083
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1219
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
10841220 * may depend on host virtualization features rather than host cpu features.
10851221 */
10861222
1087
-static u32 msrs_to_save[] = {
1223
+static const u32 msrs_to_save_all[] = {
10881224 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
10891225 MSR_STAR,
10901226 #ifdef CONFIG_X86_64
10911227 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
10921228 #endif
10931229 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1094
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1095
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1230
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1231
+ MSR_IA32_SPEC_CTRL,
1232
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1233
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1234
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1235
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1236
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1237
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1238
+ MSR_IA32_UMWAIT_CONTROL,
1239
+
1240
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1241
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1242
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1243
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1244
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1245
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1246
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1247
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1248
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1249
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1250
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1251
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1252
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1253
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1254
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1255
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1256
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1257
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1258
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1259
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1260
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1261
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1262
+
1263
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1264
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1265
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1266
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1267
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1268
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
10961269 };
10971270
1271
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
10981272 static unsigned num_msrs_to_save;
10991273
1100
-static u32 emulated_msrs[] = {
1274
+static const u32 emulated_msrs_all[] = {
11011275 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
11021276 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
11031277 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
....@@ -1113,12 +1287,18 @@
11131287 HV_X64_MSR_VP_ASSIST_PAGE,
11141288 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
11151289 HV_X64_MSR_TSC_EMULATION_STATUS,
1290
+ HV_X64_MSR_SYNDBG_OPTIONS,
1291
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1292
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1293
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
11161294
11171295 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1118
- MSR_KVM_PV_EOI_EN,
1296
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
11191297
11201298 MSR_IA32_TSC_ADJUST,
11211299 MSR_IA32_TSCDEADLINE,
1300
+ MSR_IA32_ARCH_CAPABILITIES,
1301
+ MSR_IA32_PERF_CAPABILITIES,
11221302 MSR_IA32_MISC_ENABLE,
11231303 MSR_IA32_MCG_STATUS,
11241304 MSR_IA32_MCG_CTL,
....@@ -1128,15 +1308,41 @@
11281308 MSR_PLATFORM_INFO,
11291309 MSR_MISC_FEATURES_ENABLES,
11301310 MSR_AMD64_VIRT_SPEC_CTRL,
1311
+ MSR_IA32_POWER_CTL,
1312
+ MSR_IA32_UCODE_REV,
1313
+
1314
+ /*
1315
+ * The following list leaves out MSRs whose values are determined
1316
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1317
+ * We always support the "true" VMX control MSRs, even if the host
1318
+ * processor does not, so I am putting these registers here rather
1319
+ * than in msrs_to_save_all.
1320
+ */
1321
+ MSR_IA32_VMX_BASIC,
1322
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1323
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1324
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
1325
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1326
+ MSR_IA32_VMX_MISC,
1327
+ MSR_IA32_VMX_CR0_FIXED0,
1328
+ MSR_IA32_VMX_CR4_FIXED0,
1329
+ MSR_IA32_VMX_VMCS_ENUM,
1330
+ MSR_IA32_VMX_PROCBASED_CTLS2,
1331
+ MSR_IA32_VMX_EPT_VPID_CAP,
1332
+ MSR_IA32_VMX_VMFUNC,
1333
+
1334
+ MSR_K7_HWCR,
1335
+ MSR_KVM_POLL_CONTROL,
11311336 };
11321337
1338
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
11331339 static unsigned num_emulated_msrs;
11341340
11351341 /*
11361342 * List of msr numbers which are used to expose MSR-based features that
11371343 * can be used by a hypervisor to validate requested CPU features.
11381344 */
1139
-static u32 msr_based_features[] = {
1345
+static const u32 msr_based_features_all[] = {
11401346 MSR_IA32_VMX_BASIC,
11411347 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
11421348 MSR_IA32_VMX_PINBASED_CTLS,
....@@ -1156,18 +1362,41 @@
11561362 MSR_IA32_VMX_EPT_VPID_CAP,
11571363 MSR_IA32_VMX_VMFUNC,
11581364
1159
- MSR_F10H_DECFG,
1365
+ MSR_AMD64_DE_CFG,
11601366 MSR_IA32_UCODE_REV,
11611367 MSR_IA32_ARCH_CAPABILITIES,
1368
+ MSR_IA32_PERF_CAPABILITIES,
11621369 };
11631370
1371
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
11641372 static unsigned int num_msr_based_features;
11651373
1166
-u64 kvm_get_arch_capabilities(void)
1167
-{
1168
- u64 data;
1374
+/*
1375
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1376
+ * does not yet virtualize. These include:
1377
+ * 10 - MISC_PACKAGE_CTRLS
1378
+ * 11 - ENERGY_FILTERING_CTL
1379
+ * 12 - DOITM
1380
+ * 18 - FB_CLEAR_CTRL
1381
+ * 21 - XAPIC_DISABLE_STATUS
1382
+ * 23 - OVERCLOCKING_STATUS
1383
+ */
11691384
1170
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1385
+#define KVM_SUPPORTED_ARCH_CAP \
1386
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1387
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1388
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1389
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1390
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
1391
+
1392
+static u64 kvm_get_arch_capabilities(void)
1393
+{
1394
+ u64 data = 0;
1395
+
1396
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1397
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1398
+ data &= KVM_SUPPORTED_ARCH_CAP;
1399
+ }
11711400
11721401 /*
11731402 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
....@@ -1196,34 +1425,27 @@
11961425 if (!boot_cpu_has_bug(X86_BUG_MDS))
11971426 data |= ARCH_CAP_MDS_NO;
11981427
1199
- /*
1200
- * On TAA affected systems, export MDS_NO=0 when:
1201
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
1202
- * - Updated microcode is present. This is detected by
1203
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
1204
- * that VERW clears CPU buffers.
1205
- *
1206
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
1207
- * mitigation and don't complain:
1208
- *
1209
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
1210
- *
1211
- * If TSX is disabled on the system, guests are also mitigated against
1212
- * TAA and clear CPU buffer mitigation is not required for guests.
1213
- */
1214
- if (!boot_cpu_has(X86_FEATURE_RTM))
1428
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
1429
+ /*
1430
+ * If RTM=0 because the kernel has disabled TSX, the host might
1431
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
1432
+ * and therefore knows that there cannot be TAA) but keep
1433
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1434
+ * and we want to allow migrating those guests to tsx=off hosts.
1435
+ */
12151436 data &= ~ARCH_CAP_TAA_NO;
1216
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
1437
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
12171438 data |= ARCH_CAP_TAA_NO;
1218
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
1219
- data &= ~ARCH_CAP_MDS_NO;
1439
+ } else {
1440
+ /*
1441
+ * Nothing to do here; we emulate TSX_CTRL if present on the
1442
+ * host so the guest can choose between disabling TSX or
1443
+ * using VERW to clear CPU buffers.
1444
+ */
1445
+ }
12201446
1221
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
1222
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
12231447 return data;
12241448 }
1225
-
1226
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
12271449
12281450 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
12291451 {
....@@ -1235,8 +1457,7 @@
12351457 rdmsrl_safe(msr->index, &msr->data);
12361458 break;
12371459 default:
1238
- if (kvm_x86_ops->get_msr_feature(msr))
1239
- return 1;
1460
+ return kvm_x86_ops.get_msr_feature(msr);
12401461 }
12411462 return 0;
12421463 }
....@@ -1248,6 +1469,14 @@
12481469
12491470 msr.index = index;
12501471 r = kvm_get_msr_feature(&msr);
1472
+
1473
+ if (r == KVM_MSR_RET_INVALID) {
1474
+ /* Unconditionally clear the output for simplicity */
1475
+ *data = 0;
1476
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1477
+ r = 0;
1478
+ }
1479
+
12511480 if (r)
12521481 return r;
12531482
....@@ -1262,6 +1491,13 @@
12621491 return false;
12631492
12641493 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1494
+ return false;
1495
+
1496
+ if (efer & (EFER_LME | EFER_LMA) &&
1497
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1498
+ return false;
1499
+
1500
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
12651501 return false;
12661502
12671503 return true;
....@@ -1280,6 +1516,7 @@
12801516 {
12811517 u64 old_efer = vcpu->arch.efer;
12821518 u64 efer = msr_info->data;
1519
+ int r;
12831520
12841521 if (efer & efer_reserved_bits)
12851522 return 1;
....@@ -1296,7 +1533,11 @@
12961533 efer &= ~EFER_LMA;
12971534 efer |= vcpu->arch.efer & EFER_LMA;
12981535
1299
- kvm_x86_ops->set_efer(vcpu, efer);
1536
+ r = kvm_x86_ops.set_efer(vcpu, efer);
1537
+ if (r) {
1538
+ WARN_ON(r > 0);
1539
+ return r;
1540
+ }
13001541
13011542 /* Update reserved bits */
13021543 if ((efer ^ old_efer) & EFER_NX)
....@@ -1311,20 +1552,70 @@
13111552 }
13121553 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
13131554
1555
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1556
+{
1557
+ struct kvm_x86_msr_filter *msr_filter;
1558
+ struct msr_bitmap_range *ranges;
1559
+ struct kvm *kvm = vcpu->kvm;
1560
+ bool allowed;
1561
+ int idx;
1562
+ u32 i;
1563
+
1564
+ /* x2APIC MSRs do not support filtering. */
1565
+ if (index >= 0x800 && index <= 0x8ff)
1566
+ return true;
1567
+
1568
+ idx = srcu_read_lock(&kvm->srcu);
1569
+
1570
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1571
+ if (!msr_filter) {
1572
+ allowed = true;
1573
+ goto out;
1574
+ }
1575
+
1576
+ allowed = msr_filter->default_allow;
1577
+ ranges = msr_filter->ranges;
1578
+
1579
+ for (i = 0; i < msr_filter->count; i++) {
1580
+ u32 start = ranges[i].base;
1581
+ u32 end = start + ranges[i].nmsrs;
1582
+ u32 flags = ranges[i].flags;
1583
+ unsigned long *bitmap = ranges[i].bitmap;
1584
+
1585
+ if ((index >= start) && (index < end) && (flags & type)) {
1586
+ allowed = !!test_bit(index - start, bitmap);
1587
+ break;
1588
+ }
1589
+ }
1590
+
1591
+out:
1592
+ srcu_read_unlock(&kvm->srcu, idx);
1593
+
1594
+ return allowed;
1595
+}
1596
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1597
+
13141598 /*
1315
- * Writes msr value into into the appropriate "register".
1599
+ * Write @data into the MSR specified by @index. Select MSR specific fault
1600
+ * checks are bypassed if @host_initiated is %true.
13161601 * Returns 0 on success, non-0 otherwise.
13171602 * Assumes vcpu_load() was already called.
13181603 */
1319
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1604
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1605
+ bool host_initiated)
13201606 {
1321
- switch (msr->index) {
1607
+ struct msr_data msr;
1608
+
1609
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1610
+ return KVM_MSR_RET_FILTERED;
1611
+
1612
+ switch (index) {
13221613 case MSR_FS_BASE:
13231614 case MSR_GS_BASE:
13241615 case MSR_KERNEL_GS_BASE:
13251616 case MSR_CSTAR:
13261617 case MSR_LSTAR:
1327
- if (is_noncanonical_address(msr->data, vcpu))
1618
+ if (is_noncanonical_address(data, vcpu))
13281619 return 1;
13291620 break;
13301621 case MSR_IA32_SYSENTER_EIP:
....@@ -1341,54 +1632,313 @@
13411632 * value, and that something deterministic happens if the guest
13421633 * invokes 64-bit SYSENTER.
13431634 */
1344
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1635
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
13451636 }
1346
- return kvm_x86_ops->set_msr(vcpu, msr);
1637
+
1638
+ msr.data = data;
1639
+ msr.index = index;
1640
+ msr.host_initiated = host_initiated;
1641
+
1642
+ return kvm_x86_ops.set_msr(vcpu, &msr);
1643
+}
1644
+
1645
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1646
+ u32 index, u64 data, bool host_initiated)
1647
+{
1648
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1649
+
1650
+ if (ret == KVM_MSR_RET_INVALID)
1651
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
1652
+ ret = 0;
1653
+
1654
+ return ret;
1655
+}
1656
+
1657
+/*
1658
+ * Read the MSR specified by @index into @data. Select MSR specific fault
1659
+ * checks are bypassed if @host_initiated is %true.
1660
+ * Returns 0 on success, non-0 otherwise.
1661
+ * Assumes vcpu_load() was already called.
1662
+ */
1663
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1664
+ bool host_initiated)
1665
+{
1666
+ struct msr_data msr;
1667
+ int ret;
1668
+
1669
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1670
+ return KVM_MSR_RET_FILTERED;
1671
+
1672
+ msr.index = index;
1673
+ msr.host_initiated = host_initiated;
1674
+
1675
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
1676
+ if (!ret)
1677
+ *data = msr.data;
1678
+ return ret;
1679
+}
1680
+
1681
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1682
+ u32 index, u64 *data, bool host_initiated)
1683
+{
1684
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1685
+
1686
+ if (ret == KVM_MSR_RET_INVALID) {
1687
+ /* Unconditionally clear *data for simplicity */
1688
+ *data = 0;
1689
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1690
+ ret = 0;
1691
+ }
1692
+
1693
+ return ret;
1694
+}
1695
+
1696
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1697
+{
1698
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
1699
+}
1700
+EXPORT_SYMBOL_GPL(kvm_get_msr);
1701
+
1702
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1703
+{
1704
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
13471705 }
13481706 EXPORT_SYMBOL_GPL(kvm_set_msr);
1707
+
1708
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1709
+{
1710
+ if (vcpu->run->msr.error) {
1711
+ kvm_inject_gp(vcpu, 0);
1712
+ return 1;
1713
+ } else if (is_read) {
1714
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1715
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1716
+ }
1717
+
1718
+ return kvm_skip_emulated_instruction(vcpu);
1719
+}
1720
+
1721
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1722
+{
1723
+ return complete_emulated_msr(vcpu, true);
1724
+}
1725
+
1726
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1727
+{
1728
+ return complete_emulated_msr(vcpu, false);
1729
+}
1730
+
1731
+static u64 kvm_msr_reason(int r)
1732
+{
1733
+ switch (r) {
1734
+ case KVM_MSR_RET_INVALID:
1735
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
1736
+ case KVM_MSR_RET_FILTERED:
1737
+ return KVM_MSR_EXIT_REASON_FILTER;
1738
+ default:
1739
+ return KVM_MSR_EXIT_REASON_INVAL;
1740
+ }
1741
+}
1742
+
1743
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1744
+ u32 exit_reason, u64 data,
1745
+ int (*completion)(struct kvm_vcpu *vcpu),
1746
+ int r)
1747
+{
1748
+ u64 msr_reason = kvm_msr_reason(r);
1749
+
1750
+ /* Check if the user wanted to know about this MSR fault */
1751
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1752
+ return 0;
1753
+
1754
+ vcpu->run->exit_reason = exit_reason;
1755
+ vcpu->run->msr.error = 0;
1756
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1757
+ vcpu->run->msr.reason = msr_reason;
1758
+ vcpu->run->msr.index = index;
1759
+ vcpu->run->msr.data = data;
1760
+ vcpu->arch.complete_userspace_io = completion;
1761
+
1762
+ return 1;
1763
+}
1764
+
1765
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1766
+{
1767
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1768
+ complete_emulated_rdmsr, r);
1769
+}
1770
+
1771
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1772
+{
1773
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1774
+ complete_emulated_wrmsr, r);
1775
+}
1776
+
1777
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1778
+{
1779
+ u32 ecx = kvm_rcx_read(vcpu);
1780
+ u64 data;
1781
+ int r;
1782
+
1783
+ r = kvm_get_msr(vcpu, ecx, &data);
1784
+
1785
+ /* MSR read failed? See if we should ask user space */
1786
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1787
+ /* Bounce to user space */
1788
+ return 0;
1789
+ }
1790
+
1791
+ /* MSR read failed? Inject a #GP */
1792
+ if (r) {
1793
+ trace_kvm_msr_read_ex(ecx);
1794
+ kvm_inject_gp(vcpu, 0);
1795
+ return 1;
1796
+ }
1797
+
1798
+ trace_kvm_msr_read(ecx, data);
1799
+
1800
+ kvm_rax_write(vcpu, data & -1u);
1801
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
1802
+ return kvm_skip_emulated_instruction(vcpu);
1803
+}
1804
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1805
+
1806
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1807
+{
1808
+ u32 ecx = kvm_rcx_read(vcpu);
1809
+ u64 data = kvm_read_edx_eax(vcpu);
1810
+ int r;
1811
+
1812
+ r = kvm_set_msr(vcpu, ecx, data);
1813
+
1814
+ /* MSR write failed? See if we should ask user space */
1815
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1816
+ /* Bounce to user space */
1817
+ return 0;
1818
+
1819
+ /* Signal all other negative errors to userspace */
1820
+ if (r < 0)
1821
+ return r;
1822
+
1823
+ /* MSR write failed? Inject a #GP */
1824
+ if (r > 0) {
1825
+ trace_kvm_msr_write_ex(ecx, data);
1826
+ kvm_inject_gp(vcpu, 0);
1827
+ return 1;
1828
+ }
1829
+
1830
+ trace_kvm_msr_write(ecx, data);
1831
+ return kvm_skip_emulated_instruction(vcpu);
1832
+}
1833
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1834
+
1835
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1836
+{
1837
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1838
+ xfer_to_guest_mode_work_pending();
1839
+}
1840
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1841
+
1842
+/*
1843
+ * The fast path for frequent and performance sensitive wrmsr emulation,
1844
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
1845
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
1846
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
1847
+ * other cases which must be called after interrupts are enabled on the host.
1848
+ */
1849
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1850
+{
1851
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1852
+ return 1;
1853
+
1854
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1855
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1856
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1857
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1858
+
1859
+ data &= ~(1 << 12);
1860
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1861
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1862
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1863
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
1864
+ return 0;
1865
+ }
1866
+
1867
+ return 1;
1868
+}
1869
+
1870
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1871
+{
1872
+ if (!kvm_can_use_hv_timer(vcpu))
1873
+ return 1;
1874
+
1875
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
1876
+ return 0;
1877
+}
1878
+
1879
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1880
+{
1881
+ u32 msr = kvm_rcx_read(vcpu);
1882
+ u64 data;
1883
+ fastpath_t ret = EXIT_FASTPATH_NONE;
1884
+
1885
+ switch (msr) {
1886
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
1887
+ data = kvm_read_edx_eax(vcpu);
1888
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1889
+ kvm_skip_emulated_instruction(vcpu);
1890
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
1891
+ }
1892
+ break;
1893
+ case MSR_IA32_TSCDEADLINE:
1894
+ data = kvm_read_edx_eax(vcpu);
1895
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1896
+ kvm_skip_emulated_instruction(vcpu);
1897
+ ret = EXIT_FASTPATH_REENTER_GUEST;
1898
+ }
1899
+ break;
1900
+ default:
1901
+ break;
1902
+ }
1903
+
1904
+ if (ret != EXIT_FASTPATH_NONE)
1905
+ trace_kvm_msr_write(msr, data);
1906
+
1907
+ return ret;
1908
+}
1909
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
13491910
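
A quick worked example of the fastpath's MSR match: with the usual APIC_BASE_MSR (0x800) and APIC_ICR (0x300) values from asm/apicdef.h, APIC_BASE_MSR + (APIC_ICR >> 4) is 0x830, the x2APIC ICR, whose 64-bit payload splits into a low word written to APIC_ICR and the destination written to APIC_ICR2. A stand-alone sketch; the constants and the sample ICR value are illustrative assumptions, not taken from this file:

#include <stdio.h>
#include <stdint.h>

#define APIC_BASE_MSR 0x800     /* base of the x2APIC MSR range */
#define APIC_ICR      0x300     /* xAPIC MMIO offset of the ICR  */

int main(void)
{
    /* Fixed-mode, physical-destination IPI: vector 0x2a to APIC ID 3. */
    uint64_t icr = ((uint64_t)3 << 32) | 0x402a;

    printf("fastpath MSR index   : 0x%x\n", APIC_BASE_MSR + (APIC_ICR >> 4)); /* 0x830 */
    printf("low word  (APIC_ICR) : 0x%08x\n", (uint32_t)icr);
    printf("high word (APIC_ICR2): 0x%08x\n", (uint32_t)(icr >> 32));
    return 0;
}
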
13501911 /*
13511912 * Adapt set_msr() to msr_io()'s calling convention
13521913 */
13531914 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13541915 {
1355
- struct msr_data msr;
1356
- int r;
1357
-
1358
- msr.index = index;
1359
- msr.host_initiated = true;
1360
- r = kvm_get_msr(vcpu, &msr);
1361
- if (r)
1362
- return r;
1363
-
1364
- *data = msr.data;
1365
- return 0;
1916
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
13661917 }
13671918
13681919 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13691920 {
1370
- struct msr_data msr;
1371
-
1372
- msr.data = *data;
1373
- msr.index = index;
1374
- msr.host_initiated = true;
1375
- return kvm_set_msr(vcpu, &msr);
1921
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
13761922 }
13771923
13781924 #ifdef CONFIG_X86_64
1925
+struct pvclock_clock {
1926
+ int vclock_mode;
1927
+ u64 cycle_last;
1928
+ u64 mask;
1929
+ u32 mult;
1930
+ u32 shift;
1931
+ u64 base_cycles;
1932
+ u64 offset;
1933
+};
1934
+
13791935 struct pvclock_gtod_data {
13801936 seqcount_t seq;
13811937
1382
- struct { /* extract of a clocksource struct */
1383
- int vclock_mode;
1384
- u64 cycle_last;
1385
- u64 mask;
1386
- u32 mult;
1387
- u32 shift;
1388
- } clock;
1938
+ struct pvclock_clock clock; /* extract of a clocksource struct */
1939
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
13891940
1390
- u64 boot_ns;
1391
- u64 nsec_base;
1941
+ ktime_t offs_boot;
13921942 u64 wall_time_sec;
13931943 };
13941944
....@@ -1397,44 +1947,54 @@
13971947 static void update_pvclock_gtod(struct timekeeper *tk)
13981948 {
13991949 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1400
- u64 boot_ns;
1401
-
1402
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
14031950
14041951 write_seqcount_begin(&vdata->seq);
14051952
14061953 /* copy pvclock gtod data */
1407
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1954
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
14081955 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
14091956 vdata->clock.mask = tk->tkr_mono.mask;
14101957 vdata->clock.mult = tk->tkr_mono.mult;
14111958 vdata->clock.shift = tk->tkr_mono.shift;
1959
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
1960
+ vdata->clock.offset = tk->tkr_mono.base;
14121961
1413
- vdata->boot_ns = boot_ns;
1414
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1962
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
1963
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
1964
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
1965
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
1966
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
1967
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
1968
+ vdata->raw_clock.offset = tk->tkr_raw.base;
14151969
14161970 vdata->wall_time_sec = tk->xtime_sec;
14171971
1972
+ vdata->offs_boot = tk->offs_boot;
1973
+
14181974 write_seqcount_end(&vdata->seq);
14191975 }
1420
-#endif
14211976
1422
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1977
+static s64 get_kvmclock_base_ns(void)
14231978 {
1424
- /*
1425
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1426
- * vcpu_enter_guest. This function is only called from
1427
- * the physical CPU that is running vcpu.
1428
- */
1429
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1979
+ /* Count up from boot time, but with the frequency of the raw clock. */
1980
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
14301981 }
1982
+#else
1983
+static s64 get_kvmclock_base_ns(void)
1984
+{
1985
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
1986
+ return ktime_get_boottime_ns();
1987
+}
1988
+#endif
14311989
14321990 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
14331991 {
14341992 int version;
14351993 int r;
14361994 struct pvclock_wall_clock wc;
1437
- struct timespec64 boot;
1995
+ u64 wall_nsec;
1996
+
1997
+ kvm->arch.wall_clock = wall_clock;
14381998
14391999 if (!wall_clock)
14402000 return;
....@@ -1454,23 +2014,46 @@
14542014 /*
14552015 * The guest calculates current wall clock time by adding
14562016 * system time (updated by kvm_guest_time_update below) to the
1457
- * wall clock specified here. guest system time equals host
1458
- * system time for us, thus we must fill in host boot time here.
2017
+ * wall clock specified here. We do the reverse here.
14592018 */
1460
- getboottime64(&boot);
2019
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
14612020
1462
- if (kvm->arch.kvmclock_offset) {
1463
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1464
- boot = timespec64_sub(boot, ts);
1465
- }
1466
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1467
- wc.nsec = boot.tv_nsec;
2021
+ wc.nsec = do_div(wall_nsec, 1000000000);
2022
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
14682023 wc.version = version;
14692024
14702025 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
14712026
14722027 version++;
14732028 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2029
+}
2030
+
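
The sec/nsec split below relies on do_div(), which divides its first argument in place (leaving the quotient) and returns the remainder, so wc.nsec receives the sub-second part and wc.sec the truncated seconds. A plain-C stand-in with a made-up delta:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
    /* Illustrative value for ktime_get_real_ns() - get_kvmclock_ns(kvm). */
    uint64_t wall_nsec = 1700000000123456789ULL;

    uint32_t nsec = (uint32_t)(wall_nsec % NSEC_PER_SEC); /* what do_div() returns      */
    uint32_t sec  = (uint32_t)(wall_nsec / NSEC_PER_SEC); /* what do_div() leaves behind */

    printf("wc.sec = %u, wc.nsec = %u\n", sec, nsec);     /* 1700000000, 123456789 */
    return 0;
}
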
2031
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2032
+ bool old_msr, bool host_initiated)
2033
+{
2034
+ struct kvm_arch *ka = &vcpu->kvm->arch;
2035
+
2036
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
2037
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2038
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2039
+
2040
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
2041
+ }
2042
+
2043
+ vcpu->arch.time = system_time;
2044
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2045
+
2046
+ /* we verify if the enable bit is set... */
2047
+ vcpu->arch.pv_time_enabled = false;
2048
+ if (!(system_time & 1))
2049
+ return;
2050
+
2051
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2052
+ &vcpu->arch.pv_time, system_time & ~1ULL,
2053
+ sizeof(struct pvclock_vcpu_time_info)))
2054
+ vcpu->arch.pv_time_enabled = true;
2055
+
2056
+ return;
14742057 }
14752058
14762059 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
....@@ -1505,9 +2088,6 @@
15052088
15062089 *pshift = shift;
15072090 *pmultiplier = div_frac(scaled64, tps32);
1508
-
1509
- pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1510
- __func__, base_hz, scaled_hz, shift, *pmultiplier);
15112091 }
15122092
15132093 #ifdef CONFIG_X86_64
....@@ -1604,7 +2184,7 @@
16042184
16052185 static inline int gtod_is_based_on_tsc(int mode)
16062186 {
1607
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
2187
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
16082188 }
16092189
16102190 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
....@@ -1633,12 +2213,6 @@
16332213 atomic_read(&vcpu->kvm->online_vcpus),
16342214 ka->use_master_clock, gtod->clock.vclock_mode);
16352215 #endif
1636
-}
1637
-
1638
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1639
-{
1640
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1641
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
16422216 }
16432217
16442218 /*
....@@ -1679,15 +2253,14 @@
16792253
16802254 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
16812255 {
1682
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1683
-
1684
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2256
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
16852257 }
16862258 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
16872259
16882260 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
16892261 {
1690
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
2262
+ vcpu->arch.l1_tsc_offset = offset;
2263
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
16912264 }
16922265
16932266 static inline bool kvm_check_tsc_unstable(void)
....@@ -1697,29 +2270,28 @@
16972270 * TSC is marked unstable when we're running on Hyper-V,
16982271 * 'TSC page' clocksource is good.
16992272 */
1700
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
2273
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
17012274 return false;
17022275 #endif
17032276 return check_tsc_unstable();
17042277 }
17052278
1706
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
2279
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
17072280 {
17082281 struct kvm *kvm = vcpu->kvm;
17092282 u64 offset, ns, elapsed;
17102283 unsigned long flags;
17112284 bool matched;
17122285 bool already_matched;
1713
- u64 data = msr->data;
17142286 bool synchronizing = false;
17152287
17162288 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
17172289 offset = kvm_compute_tsc_offset(vcpu, data);
1718
- ns = ktime_get_boot_ns();
2290
+ ns = get_kvmclock_base_ns();
17192291 elapsed = ns - kvm->arch.last_tsc_nsec;
17202292
17212293 if (vcpu->arch.virtual_tsc_khz) {
1722
- if (data == 0 && msr->host_initiated) {
2294
+ if (data == 0) {
17232295 /*
17242296 * detection of vcpu initialization -- need to sync
17252297 * with other vCPUs. This particularly helps to keep
....@@ -1750,12 +2322,10 @@
17502322 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
17512323 if (!kvm_check_tsc_unstable()) {
17522324 offset = kvm->arch.cur_tsc_offset;
1753
- pr_debug("kvm: matched tsc offset for %llu\n", data);
17542325 } else {
17552326 u64 delta = nsec_to_cycles(vcpu, elapsed);
17562327 data += delta;
17572328 offset = kvm_compute_tsc_offset(vcpu, data);
1758
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
17592329 }
17602330 matched = true;
17612331 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
....@@ -1774,8 +2344,6 @@
17742344 kvm->arch.cur_tsc_write = data;
17752345 kvm->arch.cur_tsc_offset = offset;
17762346 matched = false;
1777
- pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1778
- kvm->arch.cur_tsc_generation, data);
17792347 }
17802348
17812349 /*
....@@ -1793,9 +2361,6 @@
17932361 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
17942362 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
17952363
1796
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1797
- update_ia32_tsc_adjust_msr(vcpu, offset);
1798
-
17992364 kvm_vcpu_write_tsc_offset(vcpu, offset);
18002365 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
18012366
....@@ -1810,12 +2375,10 @@
18102375 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
18112376 }
18122377
1813
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
1814
-
18152378 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
18162379 s64 adjustment)
18172380 {
1818
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
2381
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
18192382 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
18202383 }
18212384
....@@ -1849,43 +2412,43 @@
18492412 return last;
18502413 }
18512414
1852
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
2415
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2416
+ int *mode)
18532417 {
18542418 long v;
1855
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18562419 u64 tsc_pg_val;
18572420
1858
- switch (gtod->clock.vclock_mode) {
1859
- case VCLOCK_HVCLOCK:
2421
+ switch (clock->vclock_mode) {
2422
+ case VDSO_CLOCKMODE_HVCLOCK:
18602423 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
18612424 tsc_timestamp);
18622425 if (tsc_pg_val != U64_MAX) {
18632426 /* TSC page valid */
1864
- *mode = VCLOCK_HVCLOCK;
1865
- v = (tsc_pg_val - gtod->clock.cycle_last) &
1866
- gtod->clock.mask;
2427
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
2428
+ v = (tsc_pg_val - clock->cycle_last) &
2429
+ clock->mask;
18672430 } else {
18682431 /* TSC page invalid */
1869
- *mode = VCLOCK_NONE;
2432
+ *mode = VDSO_CLOCKMODE_NONE;
18702433 }
18712434 break;
1872
- case VCLOCK_TSC:
1873
- *mode = VCLOCK_TSC;
2435
+ case VDSO_CLOCKMODE_TSC:
2436
+ *mode = VDSO_CLOCKMODE_TSC;
18742437 *tsc_timestamp = read_tsc();
1875
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
1876
- gtod->clock.mask;
2438
+ v = (*tsc_timestamp - clock->cycle_last) &
2439
+ clock->mask;
18772440 break;
18782441 default:
1879
- *mode = VCLOCK_NONE;
2442
+ *mode = VDSO_CLOCKMODE_NONE;
18802443 }
18812444
1882
- if (*mode == VCLOCK_NONE)
2445
+ if (*mode == VDSO_CLOCKMODE_NONE)
18832446 *tsc_timestamp = v = 0;
18842447
1885
- return v * gtod->clock.mult;
2448
+ return v * clock->mult;
18862449 }
18872450
1888
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
2451
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
18892452 {
18902453 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18912454 unsigned long seq;
....@@ -1894,10 +2457,10 @@
18942457
18952458 do {
18962459 seq = read_seqcount_begin(&gtod->seq);
1897
- ns = gtod->nsec_base;
1898
- ns += vgettsc(tsc_timestamp, &mode);
1899
- ns >>= gtod->clock.shift;
1900
- ns += gtod->boot_ns;
2460
+ ns = gtod->raw_clock.base_cycles;
2461
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2462
+ ns >>= gtod->raw_clock.shift;
2463
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
19012464 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19022465 *t = ns;
19032466
....@@ -1914,8 +2477,8 @@
19142477 do {
19152478 seq = read_seqcount_begin(&gtod->seq);
19162479 ts->tv_sec = gtod->wall_time_sec;
1917
- ns = gtod->nsec_base;
1918
- ns += vgettsc(tsc_timestamp, &mode);
2480
+ ns = gtod->clock.base_cycles;
2481
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
19192482 ns >>= gtod->clock.shift;
19202483 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19212484
....@@ -1932,7 +2495,7 @@
19322495 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
19332496 return false;
19342497
1935
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
2498
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
19362499 tsc_timestamp));
19372500 }
19382501
....@@ -2057,7 +2620,7 @@
20572620 spin_lock(&ka->pvclock_gtod_sync_lock);
20582621 if (!ka->use_master_clock) {
20592622 spin_unlock(&ka->pvclock_gtod_sync_lock);
2060
- return ktime_get_boot_ns() + ka->kvmclock_offset;
2623
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
20612624 }
20622625
20632626 hv_clock.tsc_timestamp = ka->master_cycle_now;
....@@ -2073,7 +2636,7 @@
20732636 &hv_clock.tsc_to_system_mul);
20742637 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
20752638 } else
2076
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
2639
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
20772640
20782641 put_cpu();
20792642
....@@ -2172,7 +2735,7 @@
21722735 }
21732736 if (!use_master_clock) {
21742737 host_tsc = rdtsc();
2175
- kernel_ns = ktime_get_boot_ns();
2738
+ kernel_ns = get_kvmclock_base_ns();
21762739 }
21772740
21782741 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
....@@ -2284,6 +2847,18 @@
22842847 KVMCLOCK_SYNC_PERIOD);
22852848 }
22862849
2850
+/*
2851
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2852
+ */
2853
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2854
+{
2855
+ /* McStatusWrEn enabled? */
2856
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
2857
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2858
+
2859
+ return false;
2860
+}
2861
+
22872862 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
22882863 {
22892864 u64 mcg_cap = vcpu->arch.mcg_cap;
....@@ -2313,14 +2888,22 @@
23132888 /* only 0 or all 1s can be written to IA32_MCi_CTL
23142889 * some Linux kernels though clear bit 10 in bank 4 to
23152890 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
2316
- * this to avoid an uncatched #GP in the guest
2891
+ * this to avoid an uncaught #GP in the guest.
2892
+ *
2893
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
2894
+ * correctable, single-bit ECC data errors.
23172895 */
23182896 if ((offset & 0x3) == 0 &&
2319
- data != 0 && (data | (1 << 10)) != ~(u64)0)
2320
- return -1;
2897
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
2898
+ return 1;
2899
+
2900
+ /* MCi_STATUS */
23212901 if (!msr_info->host_initiated &&
2322
- (offset & 0x3) == 1 && data != 0)
2323
- return -1;
2902
+ (offset & 0x3) == 1 && data != 0) {
2903
+ if (!can_set_mci_status(vcpu))
2904
+ return 1;
2905
+ }
2906
+
23242907 vcpu->arch.mce_banks[offset] = data;
23252908 break;
23262909 }
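
The relaxed IA32_MCi_CTL check above accepts 0, all ones, and all ones with bit 10 (the K8 GART quirk) and/or bit 0 (the UnixWare ECC quirk) cleared; anything else returns 1 and, for guest writes, ends up as a #GP. A small stand-alone check that mirrors the condition:

#include <stdio.h>
#include <stdint.h>

/* Mirrors the MCi_CTL validity test used above. */
static int mci_ctl_rejected(uint64_t data)
{
    return data != 0 && (data | (1ULL << 10) | 1) != ~(uint64_t)0;
}

int main(void)
{
    uint64_t vals[] = {
        0,                          /* disabled: accepted          */
        ~0ULL,                      /* all error reporting on      */
        ~0ULL & ~(1ULL << 10),      /* K8 GART workaround          */
        ~0ULL & ~1ULL,              /* UnixWare single-bit ECC     */
        0x1,                        /* partial enable: rejected    */
    };

    for (unsigned int i = 0; i < sizeof(vals) / sizeof(vals[0]); i++)
        printf("%#018llx -> %s\n", (unsigned long long)vals[i],
               mci_ctl_rejected(vals[i]) ? "reject (#GP)" : "accept");
    return 0;
}
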
....@@ -2340,61 +2923,99 @@
23402923 u32 page_num = data & ~PAGE_MASK;
23412924 u64 page_addr = data & PAGE_MASK;
23422925 u8 *page;
2343
- int r;
23442926
2345
- r = -E2BIG;
23462927 if (page_num >= blob_size)
2347
- goto out;
2348
- r = -ENOMEM;
2928
+ return 1;
2929
+
23492930 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2350
- if (IS_ERR(page)) {
2351
- r = PTR_ERR(page);
2352
- goto out;
2931
+ if (IS_ERR(page))
2932
+ return PTR_ERR(page);
2933
+
2934
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2935
+ kfree(page);
2936
+ return 1;
23532937 }
2354
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2355
- goto out_free;
2356
- r = 0;
2357
-out_free:
2358
- kfree(page);
2359
-out:
2360
- return r;
2938
+ return 0;
2939
+}
2940
+
2941
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2942
+{
2943
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2944
+
2945
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
23612946 }
23622947
23632948 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
23642949 {
23652950 gpa_t gpa = data & ~0x3f;
23662951
2367
- /* Bits 3:5 are reserved, Should be zero */
2368
- if (data & 0x38)
2952
+ /* Bits 4:5 are reserved; should be zero */
2953
+ if (data & 0x30)
23692954 return 1;
23702955
2371
- vcpu->arch.apf.msr_val = data;
2956
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2957
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2958
+ return 1;
23722959
2373
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
2960
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2961
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2962
+ return 1;
2963
+
2964
+ if (!lapic_in_kernel(vcpu))
2965
+ return data ? 1 : 0;
2966
+
2967
+ vcpu->arch.apf.msr_en_val = data;
2968
+
2969
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
23742970 kvm_clear_async_pf_completion_queue(vcpu);
23752971 kvm_async_pf_hash_reset(vcpu);
23762972 return 0;
23772973 }
23782974
23792975 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2380
- sizeof(u32)))
2976
+ sizeof(u64)))
23812977 return 1;
23822978
23832979 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
23842980 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2981
+
23852982 kvm_async_pf_wakeup_all(vcpu);
2983
+
2984
+ return 0;
2985
+}
2986
+
2987
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2988
+{
2989
+ /* Bits 8-63 are reserved */
2990
+ if (data >> 8)
2991
+ return 1;
2992
+
2993
+ if (!lapic_in_kernel(vcpu))
2994
+ return 1;
2995
+
2996
+ vcpu->arch.apf.msr_int_val = data;
2997
+
2998
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
2999
+
23863000 return 0;
23873001 }
23883002
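
Putting the two enable paths together: a guest selecting the interrupt-based flavour writes a 64-byte-aligned GPA plus flag bits into MSR_KVM_ASYNC_PF_EN and the notification vector into MSR_KVM_ASYNC_PF_INT. A guest-side sketch; the bit positions are assumed to match uapi/asm/kvm_para.h (ENABLED bit 0, SEND_ALWAYS bit 1, DELIVERY_AS_PF_VMEXIT bit 2, DELIVERY_AS_INT bit 3) rather than taken from this file:

#include <stdint.h>

#define KVM_ASYNC_PF_ENABLED          (1 << 0)   /* assumed bit layout */
#define KVM_ASYNC_PF_SEND_ALWAYS      (1 << 1)
#define KVM_ASYNC_PF_DELIVERY_AS_INT  (1 << 3)

/* Value a guest would write to MSR_KVM_ASYNC_PF_EN: the shared area must be
 * 64-byte aligned because the low six bits carry flags, and bits 4-5 must
 * stay zero or the write above fails with #GP. */
static uint64_t async_pf_en_value(uint64_t shared_area_gpa)
{
    return (shared_area_gpa & ~0x3fULL) |
           KVM_ASYNC_PF_ENABLED |
           KVM_ASYNC_PF_DELIVERY_AS_INT;
}
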
23893003 static void kvmclock_reset(struct kvm_vcpu *vcpu)
23903004 {
23913005 vcpu->arch.pv_time_enabled = false;
3006
+ vcpu->arch.time = 0;
23923007 }
23933008
2394
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
3009
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
23953010 {
23963011 ++vcpu->stat.tlb_flush;
2397
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
3012
+ kvm_x86_ops.tlb_flush_all(vcpu);
3013
+}
3014
+
3015
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3016
+{
3017
+ ++vcpu->stat.tlb_flush;
3018
+ kvm_x86_ops.tlb_flush_guest(vcpu);
23983019 }
23993020
24003021 static void record_steal_time(struct kvm_vcpu *vcpu)
....@@ -2417,8 +3038,14 @@
24173038 * Doing a TLB flush here, on the guest's behalf, can avoid
24183039 * expensive IPIs.
24193040 */
2420
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2421
- kvm_vcpu_flush_tlb(vcpu, false);
3041
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3042
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3043
+ st->preempted & KVM_VCPU_FLUSH_TLB);
3044
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
3045
+ kvm_vcpu_flush_tlb_guest(vcpu);
3046
+ } else {
3047
+ st->preempted = 0;
3048
+ }
24223049
24233050 vcpu->arch.st.preempted = 0;
24243051
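
The xchg() on st->preempted above is a grab-and-clear: it atomically fetches whatever flags the host set (KVM_VCPU_FLUSH_TLB among them) and zeroes them, so a flush request is neither lost nor handled twice. A userspace stand-in using GCC atomics, with the flag value assumed to match uapi/asm/kvm_para.h:

#include <stdio.h>
#include <stdint.h>

#define KVM_VCPU_FLUSH_TLB   (1 << 1)   /* assumed to match uapi kvm_para.h */

int main(void)
{
    uint8_t preempted = KVM_VCPU_FLUSH_TLB;          /* as if set by the host */
    uint8_t old = __atomic_exchange_n(&preempted, 0, __ATOMIC_SEQ_CST);

    if (old & KVM_VCPU_FLUSH_TLB)
        printf("flush requested; flags now %u\n", preempted);  /* flags now 0 */
    return 0;
}
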
....@@ -2465,14 +3092,31 @@
24653092 return 1;
24663093 vcpu->arch.arch_capabilities = data;
24673094 break;
3095
+ case MSR_IA32_PERF_CAPABILITIES: {
3096
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3097
+
3098
+ if (!msr_info->host_initiated)
3099
+ return 1;
3100
+ if (kvm_get_msr_feature(&msr_ent))
3101
+ return 1;
3102
+ if (data & ~msr_ent.data)
3103
+ return 1;
3104
+
3105
+ vcpu->arch.perf_capabilities = data;
3106
+
3107
+ return 0;
3108
+ }
24683109 case MSR_EFER:
24693110 return set_efer(vcpu, msr_info);
24703111 case MSR_K7_HWCR:
24713112 data &= ~(u64)0x40; /* ignore flush filter disable */
24723113 data &= ~(u64)0x100; /* ignore ignne emulation enable */
24733114 data &= ~(u64)0x8; /* ignore TLB cache disable */
2474
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
2475
- if (data != 0) {
3115
+
3116
+ /* Handle McStatusWrEn */
3117
+ if (data == BIT_ULL(18)) {
3118
+ vcpu->arch.msr_hwcr = data;
3119
+ } else if (data != 0) {
24763120 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
24773121 data);
24783122 return 1;
....@@ -2493,9 +3137,9 @@
24933137 /* Values other than LBR and BTF are vendor-specific,
24943138 thus reserved and should throw a #GP */
24953139 return 1;
2496
- }
2497
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2498
- __func__, data);
3140
+ } else if (report_ignored_msrs)
3141
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3142
+ __func__, data);
24993143 break;
25003144 case 0x200 ... 0x2ff:
25013145 return kvm_mtrr_set_msr(vcpu, msr, data);
....@@ -2520,15 +3164,46 @@
25203164 }
25213165 break;
25223166 case MSR_IA32_MISC_ENABLE:
2523
- vcpu->arch.ia32_misc_enable_msr = data;
3167
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3168
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3169
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3170
+ return 1;
3171
+ vcpu->arch.ia32_misc_enable_msr = data;
3172
+ kvm_update_cpuid_runtime(vcpu);
3173
+ } else {
3174
+ vcpu->arch.ia32_misc_enable_msr = data;
3175
+ }
25243176 break;
25253177 case MSR_IA32_SMBASE:
25263178 if (!msr_info->host_initiated)
25273179 return 1;
25283180 vcpu->arch.smbase = data;
25293181 break;
3182
+ case MSR_IA32_POWER_CTL:
3183
+ vcpu->arch.msr_ia32_power_ctl = data;
3184
+ break;
25303185 case MSR_IA32_TSC:
2531
- kvm_write_tsc(vcpu, msr_info);
3186
+ if (msr_info->host_initiated) {
3187
+ kvm_synchronize_tsc(vcpu, data);
3188
+ } else {
3189
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3190
+ adjust_tsc_offset_guest(vcpu, adj);
3191
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
3192
+ }
3193
+ break;
3194
+ case MSR_IA32_XSS:
3195
+ if (!msr_info->host_initiated &&
3196
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3197
+ return 1;
3198
+ /*
3199
+ * KVM supports exposing PT to the guest, but does not support
3200
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3201
+ * XSAVES/XRSTORS to save/restore PT MSRs.
3202
+ */
3203
+ if (data & ~supported_xss)
3204
+ return 1;
3205
+ vcpu->arch.ia32_xss = data;
3206
+ kvm_update_cpuid_runtime(vcpu);
25323207 break;
25333208 case MSR_SMI_COUNT:
25343209 if (!msr_info->host_initiated)
....@@ -2536,46 +3211,54 @@
25363211 vcpu->arch.smi_count = data;
25373212 break;
25383213 case MSR_KVM_WALL_CLOCK_NEW:
3214
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3215
+ return 1;
3216
+
3217
+ kvm_write_wall_clock(vcpu->kvm, data);
3218
+ break;
25393219 case MSR_KVM_WALL_CLOCK:
2540
- vcpu->kvm->arch.wall_clock = data;
3220
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3221
+ return 1;
3222
+
25413223 kvm_write_wall_clock(vcpu->kvm, data);
25423224 break;
25433225 case MSR_KVM_SYSTEM_TIME_NEW:
2544
- case MSR_KVM_SYSTEM_TIME: {
2545
- struct kvm_arch *ka = &vcpu->kvm->arch;
3226
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3227
+ return 1;
25463228
2547
- kvmclock_reset(vcpu);
2548
-
2549
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2550
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2551
-
2552
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2553
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2554
-
2555
- ka->boot_vcpu_runs_old_kvmclock = tmp;
2556
- }
2557
-
2558
- vcpu->arch.time = data;
2559
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2560
-
2561
- /* we verify if the enable bit is set... */
2562
- if (!(data & 1))
2563
- break;
2564
-
2565
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2566
- &vcpu->arch.pv_time, data & ~1ULL,
2567
- sizeof(struct pvclock_vcpu_time_info)))
2568
- vcpu->arch.pv_time_enabled = false;
2569
- else
2570
- vcpu->arch.pv_time_enabled = true;
2571
-
3229
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
25723230 break;
2573
- }
3231
+ case MSR_KVM_SYSTEM_TIME:
3232
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3233
+ return 1;
3234
+
3235
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3236
+ break;
25743237 case MSR_KVM_ASYNC_PF_EN:
3238
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3239
+ return 1;
3240
+
25753241 if (kvm_pv_enable_async_pf(vcpu, data))
25763242 return 1;
25773243 break;
3244
+ case MSR_KVM_ASYNC_PF_INT:
3245
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3246
+ return 1;
3247
+
3248
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
3249
+ return 1;
3250
+ break;
3251
+ case MSR_KVM_ASYNC_PF_ACK:
3252
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3253
+ return 1;
3254
+ if (data & 0x1) {
3255
+ vcpu->arch.apf.pageready_pending = false;
3256
+ kvm_check_async_pf_completion(vcpu);
3257
+ }
3258
+ break;
25783259 case MSR_KVM_STEAL_TIME:
3260
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3261
+ return 1;
25793262
25803263 if (unlikely(!sched_info_on()))
25813264 return 1;
....@@ -2592,8 +3275,22 @@
25923275
25933276 break;
25943277 case MSR_KVM_PV_EOI_EN:
3278
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3279
+ return 1;
3280
+
25953281 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
25963282 return 1;
3283
+ break;
3284
+
3285
+ case MSR_KVM_POLL_CONTROL:
3286
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3287
+ return 1;
3288
+
3289
+ /* only enable bit supported */
3290
+ if (data & (-1ULL << 1))
3291
+ return 1;
3292
+
3293
+ vcpu->arch.msr_kvm_poll_control = data;
25973294 break;
25983295
25993296 case MSR_IA32_MCG_CTL:
....@@ -2603,7 +3300,8 @@
26033300
26043301 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
26053302 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2606
- pr = true; /* fall through */
3303
+ pr = true;
3304
+ fallthrough;
26073305 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
26083306 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
26093307 if (kvm_pmu_is_valid_msr(vcpu, msr))
....@@ -2624,6 +3322,8 @@
26243322 */
26253323 break;
26263324 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3325
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3326
+ case HV_X64_MSR_SYNDBG_OPTIONS:
26273327 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
26283328 case HV_X64_MSR_CRASH_CTL:
26293329 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2669,33 +3369,11 @@
26693369 return xen_hvm_config(vcpu, data);
26703370 if (kvm_pmu_is_valid_msr(vcpu, msr))
26713371 return kvm_pmu_set_msr(vcpu, msr_info);
2672
- if (!ignore_msrs) {
2673
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2674
- msr, data);
2675
- return 1;
2676
- } else {
2677
- if (report_ignored_msrs)
2678
- vcpu_unimpl(vcpu,
2679
- "ignored wrmsr: 0x%x data 0x%llx\n",
2680
- msr, data);
2681
- break;
2682
- }
3372
+ return KVM_MSR_RET_INVALID;
26833373 }
26843374 return 0;
26853375 }
26863376 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2687
-
2688
-
2689
-/*
2690
- * Reads an msr value (of 'msr_index') into 'pdata'.
2691
- * Returns 0 on success, non-0 otherwise.
2692
- * Assumes vcpu_load() was already called.
2693
- */
2694
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2695
-{
2696
- return kvm_x86_ops->get_msr(vcpu, msr);
2697
-}
2698
-EXPORT_SYMBOL_GPL(kvm_get_msr);
26993377
27003378 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
27013379 {
....@@ -2748,7 +3426,6 @@
27483426 case MSR_K8_SYSCFG:
27493427 case MSR_K8_TSEG_ADDR:
27503428 case MSR_K8_TSEG_MASK:
2751
- case MSR_K7_HWCR:
27523429 case MSR_VM_HSAVE_PA:
27533430 case MSR_K8_INT_PENDING_MSG:
27543431 case MSR_AMD64_NB_CFG:
....@@ -2757,6 +3434,17 @@
27573434 case MSR_IA32_PERF_CTL:
27583435 case MSR_AMD64_DC_CFG:
27593436 case MSR_F15H_EX_CFG:
3437
+ /*
3438
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
3439
+ * limit) MSRs. Just return 0, as we do not want to expose the host
3440
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
3441
+ * so for existing CPU-specific MSRs.
3442
+ */
3443
+ case MSR_RAPL_POWER_UNIT:
3444
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
3445
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
3446
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
3447
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
27603448 msr_info->data = 0;
27613449 break;
27623450 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
....@@ -2765,7 +3453,7 @@
27653453 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
27663454 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
27673455 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2768
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
3456
+ return kvm_pmu_get_msr(vcpu, msr_info);
27693457 msr_info->data = 0;
27703458 break;
27713459 case MSR_IA32_UCODE_REV:
....@@ -2777,9 +3465,31 @@
27773465 return 1;
27783466 msr_info->data = vcpu->arch.arch_capabilities;
27793467 break;
2780
- case MSR_IA32_TSC:
2781
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
3468
+ case MSR_IA32_PERF_CAPABILITIES:
3469
+ if (!msr_info->host_initiated &&
3470
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3471
+ return 1;
3472
+ msr_info->data = vcpu->arch.perf_capabilities;
27823473 break;
3474
+ case MSR_IA32_POWER_CTL:
3475
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3476
+ break;
3477
+ case MSR_IA32_TSC: {
3478
+ /*
3479
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3480
+ * even when not intercepted. AMD manual doesn't explicitly
3481
+ * state this but appears to behave the same.
3482
+ *
3483
+ * On userspace reads and writes, however, we unconditionally
3484
+ * return L1's TSC value to ensure backwards-compatible
3485
+ * behavior for migration.
3486
+ */
3487
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3488
+ vcpu->arch.tsc_offset;
3489
+
3490
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3491
+ break;
3492
+ }
27833493 case MSR_MTRRcap:
27843494 case 0x200 ... 0x2ff:
27853495 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
....@@ -2805,7 +3515,6 @@
28053515 break;
28063516 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
28073517 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2808
- break;
28093518 case MSR_IA32_TSCDEADLINE:
28103519 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
28113520 break;
....@@ -2833,21 +3542,64 @@
28333542 msr_info->data = vcpu->arch.efer;
28343543 break;
28353544 case MSR_KVM_WALL_CLOCK:
3545
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3546
+ return 1;
3547
+
3548
+ msr_info->data = vcpu->kvm->arch.wall_clock;
3549
+ break;
28363550 case MSR_KVM_WALL_CLOCK_NEW:
3551
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3552
+ return 1;
3553
+
28373554 msr_info->data = vcpu->kvm->arch.wall_clock;
28383555 break;
28393556 case MSR_KVM_SYSTEM_TIME:
3557
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3558
+ return 1;
3559
+
3560
+ msr_info->data = vcpu->arch.time;
3561
+ break;
28403562 case MSR_KVM_SYSTEM_TIME_NEW:
3563
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3564
+ return 1;
3565
+
28413566 msr_info->data = vcpu->arch.time;
28423567 break;
28433568 case MSR_KVM_ASYNC_PF_EN:
2844
- msr_info->data = vcpu->arch.apf.msr_val;
3569
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3570
+ return 1;
3571
+
3572
+ msr_info->data = vcpu->arch.apf.msr_en_val;
3573
+ break;
3574
+ case MSR_KVM_ASYNC_PF_INT:
3575
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3576
+ return 1;
3577
+
3578
+ msr_info->data = vcpu->arch.apf.msr_int_val;
3579
+ break;
3580
+ case MSR_KVM_ASYNC_PF_ACK:
3581
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3582
+ return 1;
3583
+
3584
+ msr_info->data = 0;
28453585 break;
28463586 case MSR_KVM_STEAL_TIME:
3587
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3588
+ return 1;
3589
+
28473590 msr_info->data = vcpu->arch.st.msr_val;
28483591 break;
28493592 case MSR_KVM_PV_EOI_EN:
3593
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3594
+ return 1;
3595
+
28503596 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3597
+ break;
3598
+ case MSR_KVM_POLL_CONTROL:
3599
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3600
+ return 1;
3601
+
3602
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
28513603 break;
28523604 case MSR_IA32_P5_MC_ADDR:
28533605 case MSR_IA32_P5_MC_TYPE:
....@@ -2857,6 +3609,12 @@
28573609 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
28583610 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
28593611 msr_info->host_initiated);
3612
+ case MSR_IA32_XSS:
3613
+ if (!msr_info->host_initiated &&
3614
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3615
+ return 1;
3616
+ msr_info->data = vcpu->arch.ia32_xss;
3617
+ break;
28603618 case MSR_K7_CLK_CTL:
28613619 /*
28623620 * Provide expected ramp-up count for K7. All other
....@@ -2870,6 +3628,8 @@
28703628 msr_info->data = 0x20000000;
28713629 break;
28723630 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3631
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3632
+ case HV_X64_MSR_SYNDBG_OPTIONS:
28733633 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
28743634 case HV_X64_MSR_CRASH_CTL:
28753635 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2879,7 +3639,6 @@
28793639 return kvm_hv_get_msr_common(vcpu,
28803640 msr_info->index, &msr_info->data,
28813641 msr_info->host_initiated);
2882
- break;
28833642 case MSR_IA32_BBL_CR_CTL3:
28843643 /* This legacy MSR exists but isn't fully documented in current
28853644 * silicon. It is however accessed by winxp in very narrow
....@@ -2912,20 +3671,13 @@
29123671 case MSR_MISC_FEATURES_ENABLES:
29133672 msr_info->data = vcpu->arch.msr_misc_features_enables;
29143673 break;
3674
+ case MSR_K7_HWCR:
3675
+ msr_info->data = vcpu->arch.msr_hwcr;
3676
+ break;
29153677 default:
29163678 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2917
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2918
- if (!ignore_msrs) {
2919
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2920
- msr_info->index);
2921
- return 1;
2922
- } else {
2923
- if (report_ignored_msrs)
2924
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2925
- msr_info->index);
2926
- msr_info->data = 0;
2927
- }
2928
- break;
3679
+ return kvm_pmu_get_msr(vcpu, msr_info);
3680
+ return KVM_MSR_RET_INVALID;
29293681 }
29303682 return 0;
29313683 }
....@@ -2966,7 +3718,7 @@
29663718 unsigned size;
29673719
29683720 r = -EFAULT;
2969
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
3721
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
29703722 goto out;
29713723
29723724 r = -E2BIG;
....@@ -3037,24 +3789,33 @@
30373789 case KVM_CAP_HYPERV_VP_INDEX:
30383790 case KVM_CAP_HYPERV_EVENTFD:
30393791 case KVM_CAP_HYPERV_TLBFLUSH:
3792
+ case KVM_CAP_HYPERV_SEND_IPI:
3793
+ case KVM_CAP_HYPERV_CPUID:
30403794 case KVM_CAP_PCI_SEGMENT:
30413795 case KVM_CAP_DEBUGREGS:
30423796 case KVM_CAP_X86_ROBUST_SINGLESTEP:
30433797 case KVM_CAP_XSAVE:
30443798 case KVM_CAP_ASYNC_PF:
3799
+ case KVM_CAP_ASYNC_PF_INT:
30453800 case KVM_CAP_GET_TSC_KHZ:
30463801 case KVM_CAP_KVMCLOCK_CTRL:
30473802 case KVM_CAP_READONLY_MEM:
30483803 case KVM_CAP_HYPERV_TIME:
30493804 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
30503805 case KVM_CAP_TSC_DEADLINE_TIMER:
3051
- case KVM_CAP_ENABLE_CAP_VM:
30523806 case KVM_CAP_DISABLE_QUIRKS:
30533807 case KVM_CAP_SET_BOOT_CPU_ID:
30543808 case KVM_CAP_SPLIT_IRQCHIP:
30553809 case KVM_CAP_IMMEDIATE_EXIT:
3810
+ case KVM_CAP_PMU_EVENT_FILTER:
30563811 case KVM_CAP_GET_MSR_FEATURES:
30573812 case KVM_CAP_MSR_PLATFORM_INFO:
3813
+ case KVM_CAP_EXCEPTION_PAYLOAD:
3814
+ case KVM_CAP_SET_GUEST_DEBUG:
3815
+ case KVM_CAP_LAST_CPU:
3816
+ case KVM_CAP_X86_USER_SPACE_MSR:
3817
+ case KVM_CAP_X86_MSR_FILTER:
3818
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
30583819 r = 1;
30593820 break;
30603821 case KVM_CAP_SYNC_REGS:
....@@ -3064,7 +3825,8 @@
30643825 r = KVM_CLOCK_TSC_STABLE;
30653826 break;
30663827 case KVM_CAP_X86_DISABLE_EXITS:
3067
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
3828
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3829
+ KVM_X86_DISABLE_EXITS_CSTATE;
30683830 if(kvm_can_mwait_in_guest())
30693831 r |= KVM_X86_DISABLE_EXITS_MWAIT;
30703832 break;
....@@ -3077,10 +3839,10 @@
30773839 * fringe case that is not enabled except via specific settings
30783840 * of the module parameters.
30793841 */
3080
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
3842
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
30813843 break;
30823844 case KVM_CAP_VAPIC:
3083
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
3845
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
30843846 break;
30853847 case KVM_CAP_NR_VCPUS:
30863848 r = KVM_SOFT_MAX_VCPUS;
....@@ -3090,9 +3852,6 @@
30903852 break;
30913853 case KVM_CAP_MAX_VCPU_ID:
30923854 r = KVM_MAX_VCPU_ID;
3093
- break;
3094
- case KVM_CAP_NR_MEMSLOTS:
3095
- r = KVM_USER_MEM_SLOTS;
30963855 break;
30973856 case KVM_CAP_PV_MMU: /* obsolete */
30983857 r = 0;
....@@ -3110,8 +3869,20 @@
31103869 r = KVM_X2APIC_API_VALID_FLAGS;
31113870 break;
31123871 case KVM_CAP_NESTED_STATE:
3113
- r = kvm_x86_ops->get_nested_state ?
3114
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
3872
+ r = kvm_x86_ops.nested_ops->get_state ?
3873
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3874
+ break;
3875
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3876
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3877
+ break;
3878
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3879
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3880
+ break;
3881
+ case KVM_CAP_SMALLER_MAXPHYADDR:
3882
+ r = (int) allow_smaller_maxphyaddr;
3883
+ break;
3884
+ case KVM_CAP_STEAL_TIME:
3885
+ r = sched_info_on();
31153886 break;
31163887 default:
31173888 break;
....@@ -3133,11 +3904,11 @@
31333904 unsigned n;
31343905
31353906 r = -EFAULT;
3136
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3907
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
31373908 goto out;
31383909 n = msr_list.nmsrs;
31393910 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3140
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3911
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
31413912 goto out;
31423913 r = -E2BIG;
31433914 if (n < msr_list.nmsrs)
....@@ -3159,7 +3930,7 @@
31593930 struct kvm_cpuid2 cpuid;
31603931
31613932 r = -EFAULT;
3162
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3933
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
31633934 goto out;
31643935
31653936 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
....@@ -3168,12 +3939,12 @@
31683939 goto out;
31693940
31703941 r = -EFAULT;
3171
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3942
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
31723943 goto out;
31733944 r = 0;
31743945 break;
31753946 }
3176
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3947
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
31773948 r = -EFAULT;
31783949 if (copy_to_user(argp, &kvm_mce_cap_supported,
31793950 sizeof(kvm_mce_cap_supported)))
....@@ -3205,9 +3976,9 @@
32053976 case KVM_GET_MSRS:
32063977 r = msr_io(NULL, argp, do_get_msr_feature, 1);
32073978 break;
3208
- }
32093979 default:
32103980 r = -EINVAL;
3981
+ break;
32113982 }
32123983 out:
32133984 return r;
....@@ -3227,14 +3998,17 @@
32273998 {
32283999 /* Address WBINVD may be executed by guest */
32294000 if (need_emulate_wbinvd(vcpu)) {
3230
- if (kvm_x86_ops->has_wbinvd_exit())
4001
+ if (kvm_x86_ops.has_wbinvd_exit())
32314002 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
32324003 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
32334004 smp_call_function_single(vcpu->cpu,
32344005 wbinvd_ipi, NULL, 1);
32354006 }
32364007
3237
- kvm_x86_ops->vcpu_load(vcpu, cpu);
4008
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
4009
+
4010
+ /* Save host pkru register if supported */
4011
+ vcpu->arch.host_pkru = read_pkru();
32384012
32394013 /* Apply any externally detected TSC adjustments (due to suspend) */
32404014 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
....@@ -3301,7 +4075,7 @@
33014075 int idx;
33024076
33034077 if (vcpu->preempted)
3304
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
4078
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
33054079
33064080 /*
33074081 * Disable page faults because we're in atomic context here.
....@@ -3320,7 +4094,7 @@
33204094 kvm_steal_time_set_preempted(vcpu);
33214095 srcu_read_unlock(&vcpu->kvm->srcu, idx);
33224096 pagefault_enable();
3323
- kvm_x86_ops->vcpu_put(vcpu);
4097
+ kvm_x86_ops.vcpu_put(vcpu);
33244098 vcpu->arch.last_host_tsc = rdtsc();
33254099 /*
33264100 * If userspace has set any breakpoints or watchpoints, dr6 is restored
....@@ -3334,7 +4108,7 @@
33344108 struct kvm_lapic_state *s)
33354109 {
33364110 if (vcpu->arch.apicv_active)
3337
- kvm_x86_ops->sync_pir_to_irr(vcpu);
4111
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
33384112
33394113 return kvm_apic_get_state(vcpu, s);
33404114 }
....@@ -3453,8 +4227,7 @@
34534227 for (bank = 0; bank < bank_num; bank++)
34544228 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
34554229
3456
- if (kvm_x86_ops->setup_mce)
3457
- kvm_x86_ops->setup_mce(vcpu);
4230
+ kvm_x86_ops.setup_mce(vcpu);
34584231 out:
34594232 return r;
34604233 }
....@@ -3516,28 +4289,56 @@
35164289 process_smi(vcpu);
35174290
35184291 /*
3519
- * FIXME: pass injected and pending separately. This is only
3520
- * needed for nested virtualization, whose state cannot be
3521
- * migrated yet. For now we can combine them.
4292
+ * In guest mode, payload delivery should be deferred,
4293
+ * so that the L1 hypervisor can intercept #PF before
4294
+ * CR2 is modified (or intercept #DB before DR6 is
4295
+ * modified under nVMX). Unless the per-VM capability,
4296
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4297
+ * an exception payload and handle it after a KVM_GET_VCPU_EVENTS. Since we
4298
+ * opportunistically defer the exception payload, deliver it if the
4299
+ * capability hasn't been requested before processing a
4300
+ * KVM_GET_VCPU_EVENTS.
35224301 */
3523
- events->exception.injected =
3524
- (vcpu->arch.exception.pending ||
3525
- vcpu->arch.exception.injected) &&
3526
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
4302
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
4303
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4304
+ kvm_deliver_exception_payload(vcpu);
4305
+
4306
+ /*
4307
+ * The API doesn't provide the instruction length for software
4308
+ * exceptions, so don't report them. As long as the guest RIP
4309
+ * isn't advanced, we should expect to encounter the exception
4310
+ * again.
4311
+ */
4312
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4313
+ events->exception.injected = 0;
4314
+ events->exception.pending = 0;
4315
+ } else {
4316
+ events->exception.injected = vcpu->arch.exception.injected;
4317
+ events->exception.pending = vcpu->arch.exception.pending;
4318
+ /*
4319
+ * For ABI compatibility, deliberately conflate
4320
+ * pending and injected exceptions when
4321
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4322
+ */
4323
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4324
+ events->exception.injected |=
4325
+ vcpu->arch.exception.pending;
4326
+ }
35274327 events->exception.nr = vcpu->arch.exception.nr;
35284328 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3529
- events->exception.pad = 0;
35304329 events->exception.error_code = vcpu->arch.exception.error_code;
4330
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
4331
+ events->exception_payload = vcpu->arch.exception.payload;
35314332
35324333 events->interrupt.injected =
35334334 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
35344335 events->interrupt.nr = vcpu->arch.interrupt.nr;
35354336 events->interrupt.soft = 0;
3536
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
4337
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
35374338
35384339 events->nmi.injected = vcpu->arch.nmi_injected;
35394340 events->nmi.pending = vcpu->arch.nmi_pending != 0;
3540
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
4341
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
35414342 events->nmi.pad = 0;
35424343
35434344 events->sipi_vector = 0; /* never valid when reporting to user space */
....@@ -3551,10 +4352,13 @@
35514352 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
35524353 | KVM_VCPUEVENT_VALID_SHADOW
35534354 | KVM_VCPUEVENT_VALID_SMM);
4355
+ if (vcpu->kvm->arch.exception_payload_enabled)
4356
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4357
+
35544358 memset(&events->reserved, 0, sizeof(events->reserved));
35554359 }
35564360
3557
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
4361
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
35584362
35594363 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
35604364 struct kvm_vcpu_events *events)
....@@ -3562,12 +4366,24 @@
35624366 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
35634367 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
35644368 | KVM_VCPUEVENT_VALID_SHADOW
3565
- | KVM_VCPUEVENT_VALID_SMM))
4369
+ | KVM_VCPUEVENT_VALID_SMM
4370
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
35664371 return -EINVAL;
35674372
3568
- if (events->exception.injected &&
3569
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3570
- is_guest_mode(vcpu)))
4373
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4374
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4375
+ return -EINVAL;
4376
+ if (events->exception.pending)
4377
+ events->exception.injected = 0;
4378
+ else
4379
+ events->exception_has_payload = 0;
4380
+ } else {
4381
+ events->exception.pending = 0;
4382
+ events->exception_has_payload = 0;
4383
+ }
4384
+
4385
+ if ((events->exception.injected || events->exception.pending) &&
4386
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
35714387 return -EINVAL;
35724388
35734389 /* INITs are latched while in SMM */
....@@ -3577,35 +4393,40 @@
35774393 return -EINVAL;
35784394
35794395 process_nmi(vcpu);
3580
- vcpu->arch.exception.injected = false;
3581
- vcpu->arch.exception.pending = events->exception.injected;
4396
+ vcpu->arch.exception.injected = events->exception.injected;
4397
+ vcpu->arch.exception.pending = events->exception.pending;
35824398 vcpu->arch.exception.nr = events->exception.nr;
35834399 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
35844400 vcpu->arch.exception.error_code = events->exception.error_code;
4401
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
4402
+ vcpu->arch.exception.payload = events->exception_payload;
35854403
35864404 vcpu->arch.interrupt.injected = events->interrupt.injected;
35874405 vcpu->arch.interrupt.nr = events->interrupt.nr;
35884406 vcpu->arch.interrupt.soft = events->interrupt.soft;
35894407 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3590
- kvm_x86_ops->set_interrupt_shadow(vcpu,
4408
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
35914409 events->interrupt.shadow);
35924410
35934411 vcpu->arch.nmi_injected = events->nmi.injected;
35944412 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
35954413 vcpu->arch.nmi_pending = events->nmi.pending;
3596
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
4414
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
35974415
35984416 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
35994417 lapic_in_kernel(vcpu))
36004418 vcpu->arch.apic->sipi_vector = events->sipi_vector;
36014419
36024420 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3603
- u32 hflags = vcpu->arch.hflags;
3604
- if (events->smi.smm)
3605
- hflags |= HF_SMM_MASK;
3606
- else
3607
- hflags &= ~HF_SMM_MASK;
3608
- kvm_set_hflags(vcpu, hflags);
4421
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4422
+ if (events->smi.smm)
4423
+ vcpu->arch.hflags |= HF_SMM_MASK;
4424
+ else
4425
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
4426
+
4427
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
4428
+ kvm_smm_changed(vcpu);
4429
+ }
36094430
36104431 vcpu->arch.smi_pending = events->smi.pending;
36114432
....@@ -3614,12 +4435,13 @@
36144435 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
36154436 else
36164437 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3617
- if (lapic_in_kernel(vcpu)) {
3618
- if (events->smi.latched_init)
3619
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3620
- else
3621
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3622
- }
4438
+ }
4439
+
4440
+ if (lapic_in_kernel(vcpu)) {
4441
+ if (events->smi.latched_init)
4442
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4443
+ else
4444
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
36234445 }
36244446 }
36254447
....@@ -3655,7 +4477,6 @@
36554477 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
36564478 kvm_update_dr0123(vcpu);
36574479 vcpu->arch.dr6 = dbgregs->dr6;
3658
- kvm_update_dr6(vcpu);
36594480 vcpu->arch.dr7 = dbgregs->dr7;
36604481 kvm_update_dr7(vcpu);
36614482
....@@ -3666,7 +4487,7 @@
36664487
36674488 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
36684489 {
3669
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4490
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
36704491 u64 xstate_bv = xsave->header.xfeatures;
36714492 u64 valid;
36724493
....@@ -3686,15 +4507,15 @@
36864507 */
36874508 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
36884509 while (valid) {
3689
- u64 feature = valid & -valid;
3690
- int index = fls64(feature) - 1;
3691
- void *src = get_xsave_addr(xsave, feature);
4510
+ u64 xfeature_mask = valid & -valid;
4511
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4512
+ void *src = get_xsave_addr(xsave, xfeature_nr);
36924513
36934514 if (src) {
36944515 u32 size, offset, ecx, edx;
3695
- cpuid_count(XSTATE_CPUID, index,
4516
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
36964517 &size, &offset, &ecx, &edx);
3697
- if (feature == XFEATURE_MASK_PKRU)
4518
+ if (xfeature_nr == XFEATURE_PKRU)
36984519 memcpy(dest + offset, &vcpu->arch.pkru,
36994520 sizeof(vcpu->arch.pkru));
37004521 else
....@@ -3702,13 +4523,13 @@
37024523
37034524 }
37044525
3705
- valid -= feature;
4526
+ valid -= xfeature_mask;
37064527 }
37074528 }
37084529
37094530 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
37104531 {
3711
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4532
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
37124533 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
37134534 u64 valid;
37144535
....@@ -3729,22 +4550,22 @@
37294550 */
37304551 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
37314552 while (valid) {
3732
- u64 feature = valid & -valid;
3733
- int index = fls64(feature) - 1;
3734
- void *dest = get_xsave_addr(xsave, feature);
4553
+ u64 xfeature_mask = valid & -valid;
4554
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4555
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
37354556
37364557 if (dest) {
37374558 u32 size, offset, ecx, edx;
3738
- cpuid_count(XSTATE_CPUID, index,
4559
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
37394560 &size, &offset, &ecx, &edx);
3740
- if (feature == XFEATURE_MASK_PKRU)
4561
+ if (xfeature_nr == XFEATURE_PKRU)
37414562 memcpy(&vcpu->arch.pkru, src + offset,
37424563 sizeof(vcpu->arch.pkru));
37434564 else
37444565 memcpy(dest, src + offset, size);
37454566 }
37464567
3747
- valid -= feature;
4568
+ valid -= xfeature_mask;
37484569 }
37494570 }
37504571
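
The walk in fill_xsave()/load_xsave() peels off one extended-state component per iteration: valid & -valid isolates the lowest set bit, and fls64() - 1 converts it to the xfeature number. A stand-alone sketch with a userspace fls64() substitute and assumed example feature numbers (2 = YMM, 6 = ZMM_Hi256, 9 = PKRU):

#include <stdio.h>
#include <stdint.h>

/* Userspace stand-in for the kernel's fls64(): 1-based index of the MSB. */
static int fls64_approx(uint64_t x)
{
    return x ? 64 - __builtin_clzll(x) : 0;
}

int main(void)
{
    uint64_t valid = (1ULL << 9) | (1ULL << 6) | (1ULL << 2);

    while (valid) {
        uint64_t xfeature_mask = valid & -valid;            /* lowest set bit */
        int xfeature_nr = fls64_approx(xfeature_mask) - 1;  /* its bit number */

        printf("copy xfeature %d (mask %#llx)\n",
               xfeature_nr, (unsigned long long)xfeature_mask);
        valid -= xfeature_mask;                             /* clear and continue */
    }
    return 0;
}
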
....@@ -3756,7 +4577,7 @@
37564577 fill_xsave((u8 *) guest_xsave->region, vcpu);
37574578 } else {
37584579 memcpy(guest_xsave->region,
3759
- &vcpu->arch.guest_fpu.state.fxsave,
4580
+ &vcpu->arch.guest_fpu->state.fxsave,
37604581 sizeof(struct fxregs_state));
37614582 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
37624583 XFEATURE_MASK_FPSSE;
....@@ -3778,15 +4599,14 @@
37784599 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
37794600 * with old userspace.
37804601 */
3781
- if (xstate_bv & ~kvm_supported_xcr0() ||
3782
- mxcsr & ~mxcsr_feature_mask)
4602
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
37834603 return -EINVAL;
37844604 load_xsave(vcpu, (u8 *)guest_xsave->region);
37854605 } else {
37864606 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
37874607 mxcsr & ~mxcsr_feature_mask)
37884608 return -EINVAL;
3789
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
4609
+ memcpy(&vcpu->arch.guest_fpu->state.fxsave,
37904610 guest_xsave->region, sizeof(struct fxregs_state));
37914611 }
37924612 return 0;
@@ -3847,6 +4667,10 @@
 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 				     struct kvm_enable_cap *cap)
 {
+	int r;
+	uint16_t vmcs_version;
+	void __user *user_ptr;
+
 	if (cap->flags)
 		return -EINVAL;

@@ -3854,11 +4678,37 @@
 	case KVM_CAP_HYPERV_SYNIC2:
 		if (cap->args[0])
 			return -EINVAL;
+		fallthrough;
+
 	case KVM_CAP_HYPERV_SYNIC:
 		if (!irqchip_in_kernel(vcpu->kvm))
 			return -EINVAL;
 		return kvm_hv_activate_synic(vcpu, cap->cap ==
 					     KVM_CAP_HYPERV_SYNIC2);
+	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+		if (!kvm_x86_ops.nested_ops->enable_evmcs)
+			return -ENOTTY;
+		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+		if (!r) {
+			user_ptr = (void __user *)(uintptr_t)cap->args[0];
+			if (copy_to_user(user_ptr, &vmcs_version,
+					 sizeof(vmcs_version)))
+				r = -EFAULT;
+		}
+		return r;
+	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+		if (!kvm_x86_ops.enable_direct_tlbflush)
+			return -ENOTTY;
+
+		return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+
+	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+		vcpu->arch.pv_cpuid.enforce = cap->args[0];
+		if (vcpu->arch.pv_cpuid.enforce)
+			kvm_update_pv_runtime(vcpu);
+
+		return 0;
+
 	default:
 		return -EINVAL;
 	}
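The new KVM_CAP_HYPERV_ENLIGHTENED_VMCS handling reads a user pointer from cap->args[0] and writes the supported eVMCS version back through it. A hedged userspace sketch of enabling that capability on a vCPU fd follows; it assumes a host kernel that actually advertises the capability, and error handling is kept minimal.

/* Sketch: ask KVM to expose enlightened VMCS and report its version. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	int vcpu = vm >= 0 ? ioctl(vm, KVM_CREATE_VCPU, 0) : -1;
	uint16_t vmcs_version = 0;
	struct kvm_enable_cap cap;

	if (vcpu < 0)
		return 1;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS;
	/* KVM writes the supported eVMCS version through this user pointer. */
	cap.args[0] = (uint64_t)(uintptr_t)&vmcs_version;

	if (ioctl(vcpu, KVM_ENABLE_CAP, &cap))
		perror("KVM_ENABLE_CAP");
	else
		printf("supported eVMCS version field: 0x%x\n", vmcs_version);
	return 0;
}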
....@@ -3885,7 +4735,8 @@
38854735 r = -EINVAL;
38864736 if (!lapic_in_kernel(vcpu))
38874737 goto out;
3888
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
4738
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4739
+ GFP_KERNEL_ACCOUNT);
38894740
38904741 r = -ENOMEM;
38914742 if (!u.lapic)
....@@ -3916,7 +4767,7 @@
39164767 struct kvm_interrupt irq;
39174768
39184769 r = -EFAULT;
3919
- if (copy_from_user(&irq, argp, sizeof irq))
4770
+ if (copy_from_user(&irq, argp, sizeof(irq)))
39204771 goto out;
39214772 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
39224773 break;
....@@ -3934,7 +4785,7 @@
39344785 struct kvm_cpuid cpuid;
39354786
39364787 r = -EFAULT;
3937
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4788
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39384789 goto out;
39394790 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
39404791 break;
....@@ -3944,7 +4795,7 @@
39444795 struct kvm_cpuid2 cpuid;
39454796
39464797 r = -EFAULT;
3947
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4798
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39484799 goto out;
39494800 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
39504801 cpuid_arg->entries);
....@@ -3955,14 +4806,14 @@
39554806 struct kvm_cpuid2 cpuid;
39564807
39574808 r = -EFAULT;
3958
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4809
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39594810 goto out;
39604811 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
39614812 cpuid_arg->entries);
39624813 if (r)
39634814 goto out;
39644815 r = -EFAULT;
3965
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
4816
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
39664817 goto out;
39674818 r = 0;
39684819 break;
....@@ -3983,13 +4834,13 @@
39834834 struct kvm_tpr_access_ctl tac;
39844835
39854836 r = -EFAULT;
3986
- if (copy_from_user(&tac, argp, sizeof tac))
4837
+ if (copy_from_user(&tac, argp, sizeof(tac)))
39874838 goto out;
39884839 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
39894840 if (r)
39904841 goto out;
39914842 r = -EFAULT;
3992
- if (copy_to_user(argp, &tac, sizeof tac))
4843
+ if (copy_to_user(argp, &tac, sizeof(tac)))
39934844 goto out;
39944845 r = 0;
39954846 break;
....@@ -4002,7 +4853,7 @@
40024853 if (!lapic_in_kernel(vcpu))
40034854 goto out;
40044855 r = -EFAULT;
4005
- if (copy_from_user(&va, argp, sizeof va))
4856
+ if (copy_from_user(&va, argp, sizeof(va)))
40064857 goto out;
40074858 idx = srcu_read_lock(&vcpu->kvm->srcu);
40084859 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
....@@ -4013,7 +4864,7 @@
40134864 u64 mcg_cap;
40144865
40154866 r = -EFAULT;
4016
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
4867
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
40174868 goto out;
40184869 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
40194870 break;
....@@ -4022,7 +4873,7 @@
40224873 struct kvm_x86_mce mce;
40234874
40244875 r = -EFAULT;
4025
- if (copy_from_user(&mce, argp, sizeof mce))
4876
+ if (copy_from_user(&mce, argp, sizeof(mce)))
40264877 goto out;
40274878 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
40284879 break;
....@@ -4072,7 +4923,7 @@
40724923 break;
40734924 }
40744925 case KVM_GET_XSAVE: {
4075
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
4926
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
40764927 r = -ENOMEM;
40774928 if (!u.xsave)
40784929 break;
....@@ -4096,7 +4947,7 @@
40964947 break;
40974948 }
40984949 case KVM_GET_XCRS: {
4099
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
4950
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
41004951 r = -ENOMEM;
41014952 if (!u.xcrs)
41024953 break;
....@@ -4126,7 +4977,8 @@
41264977 r = -EINVAL;
41274978 user_tsc_khz = (u32)arg;
41284979
4129
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
4980
+ if (kvm_has_tsc_control &&
4981
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
41304982 goto out;
41314983
41324984 if (user_tsc_khz == 0)
....@@ -4159,7 +5011,7 @@
41595011 u32 user_data_size;
41605012
41615013 r = -EINVAL;
4162
- if (!kvm_x86_ops->get_nested_state)
5014
+ if (!kvm_x86_ops.nested_ops->get_state)
41635015 break;
41645016
41655017 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
....@@ -4167,8 +5019,8 @@
41675019 if (get_user(user_data_size, &user_kvm_nested_state->size))
41685020 break;
41695021
4170
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
4171
- user_data_size);
5022
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5023
+ user_data_size);
41725024 if (r < 0)
41735025 break;
41745026
....@@ -4189,7 +5041,7 @@
41895041 int idx;
41905042
41915043 r = -EINVAL;
4192
- if (!kvm_x86_ops->set_nested_state)
5044
+ if (!kvm_x86_ops.nested_ops->set_state)
41935045 break;
41945046
41955047 r = -EFAULT;
@@ -4201,16 +5053,38 @@
 			break;

 		if (kvm_state.flags &
-		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+		      | KVM_STATE_NESTED_GIF_SET))
 			break;

 		/* nested_run_pending implies guest_mode. */
-		if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
 			break;

 		idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		break;
+	}
+	case KVM_GET_SUPPORTED_HV_CPUID: {
+		struct kvm_cpuid2 __user *cpuid_arg = argp;
+		struct kvm_cpuid2 cpuid;
+
+		r = -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+			goto out;
+
+		r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
+						cpuid_arg->entries);
+		if (r)
+			goto out;
+
+		r = -EFAULT;
+		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+			goto out;
+		r = 0;
 		break;
 	}
 	default:
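KVM_GET_SUPPORTED_HV_CPUID, added above as a vCPU ioctl, fills a caller-provided struct kvm_cpuid2 with the Hyper-V CPUID leaves KVM can emulate. A hedged userspace sketch follows; the 64-entry headroom is an assumption that should comfortably cover the leaves reported by this kernel.

/* Sketch: query the Hyper-V CPUID leaves KVM supports for a vCPU. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define MAX_HV_CPUID_ENTRIES 64	/* assumed to be plenty for current hosts */

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	int vcpu = vm >= 0 ? ioctl(vm, KVM_CREATE_VCPU, 0) : -1;
	struct kvm_cpuid2 *cpuid;
	size_t size = sizeof(*cpuid) +
		      MAX_HV_CPUID_ENTRIES * sizeof(struct kvm_cpuid_entry2);
	unsigned int i;

	if (vcpu < 0)
		return 1;

	cpuid = calloc(1, size);
	cpuid->nent = MAX_HV_CPUID_ENTRIES;

	if (ioctl(vcpu, KVM_GET_SUPPORTED_HV_CPUID, cpuid) < 0) {
		perror("KVM_GET_SUPPORTED_HV_CPUID");
		return 1;
	}

	for (i = 0; i < cpuid->nent; i++)
		printf("leaf 0x%08x: eax=%08x ebx=%08x ecx=%08x edx=%08x\n",
		       cpuid->entries[i].function, cpuid->entries[i].eax,
		       cpuid->entries[i].ebx, cpuid->entries[i].ecx,
		       cpuid->entries[i].edx);
	free(cpuid);
	return 0;
}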
....@@ -4234,14 +5108,14 @@
42345108
42355109 if (addr > (unsigned int)(-3 * PAGE_SIZE))
42365110 return -EINVAL;
4237
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
5111
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
42385112 return ret;
42395113 }
42405114
42415115 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
42425116 u64 ident_addr)
42435117 {
4244
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
5118
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
42455119 }
42465120
42475121 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
....@@ -4382,9 +5256,6 @@
43825256 {
43835257 struct kvm_pit *pit = kvm->arch.vpit;
43845258
4385
- if (!pit)
4386
- return -ENXIO;
4387
-
43885259 /* pit->pit_state.lock was overloaded to prevent userspace from getting
43895260 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
43905261 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -4396,50 +5267,13 @@
 	return 0;
 }

-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- * 1. Take a snapshot of the bit and clear it if needed.
- * 2. Write protect the corresponding page.
- * 3. Copy the snapshot to the userspace.
- * 4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 {
-	bool is_dirty = false;
-	int r;
-
-	mutex_lock(&kvm->slots_lock);
-
 	/*
 	 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
 	 */
-	if (kvm_x86_ops->flush_log_dirty)
-		kvm_x86_ops->flush_log_dirty(kvm);
-
-	r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
-
-	/*
-	 * All the TLBs can be flushed out of mmu lock, see the comments in
-	 * kvm_mmu_slot_remove_write_access().
-	 */
-	lockdep_assert_held(&kvm->slots_lock);
-	if (is_dirty)
-		kvm_flush_remote_tlbs(kvm);
-
-	mutex_unlock(&kvm->slots_lock);
-	return r;
+	if (kvm_x86_ops.flush_log_dirty)
+		kvm_x86_ops.flush_log_dirty(kvm);
 }

 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
....@@ -4454,8 +5288,8 @@
44545288 return 0;
44555289 }
44565290
4457
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4458
- struct kvm_enable_cap *cap)
5291
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5292
+ struct kvm_enable_cap *cap)
44595293 {
44605294 int r;
44615295
....@@ -4513,10 +5347,25 @@
45135347 kvm->arch.hlt_in_guest = true;
45145348 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
45155349 kvm->arch.pause_in_guest = true;
5350
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5351
+ kvm->arch.cstate_in_guest = true;
45165352 r = 0;
45175353 break;
45185354 case KVM_CAP_MSR_PLATFORM_INFO:
45195355 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5356
+ r = 0;
5357
+ break;
5358
+ case KVM_CAP_EXCEPTION_PAYLOAD:
5359
+ kvm->arch.exception_payload_enabled = cap->args[0];
5360
+ r = 0;
5361
+ break;
5362
+ case KVM_CAP_X86_USER_SPACE_MSR:
5363
+ r = -EINVAL;
5364
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
5365
+ KVM_MSR_EXIT_REASON_UNKNOWN |
5366
+ KVM_MSR_EXIT_REASON_FILTER))
5367
+ break;
5368
+ kvm->arch.user_space_msr_mask = cap->args[0];
45205369 r = 0;
45215370 break;
45225371 default:
....@@ -4525,6 +5374,180 @@
45255374 }
45265375 return r;
45275376 }
5377
+
5378
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5379
+{
5380
+ struct kvm_x86_msr_filter *msr_filter;
5381
+
5382
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5383
+ if (!msr_filter)
5384
+ return NULL;
5385
+
5386
+ msr_filter->default_allow = default_allow;
5387
+ return msr_filter;
5388
+}
5389
+
5390
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5391
+{
5392
+ u32 i;
5393
+
5394
+ if (!msr_filter)
5395
+ return;
5396
+
5397
+ for (i = 0; i < msr_filter->count; i++)
5398
+ kfree(msr_filter->ranges[i].bitmap);
5399
+
5400
+ kfree(msr_filter);
5401
+}
5402
+
5403
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5404
+ struct kvm_msr_filter_range *user_range)
5405
+{
5406
+ struct msr_bitmap_range range;
5407
+ unsigned long *bitmap = NULL;
5408
+ size_t bitmap_size;
5409
+ int r;
5410
+
5411
+ if (!user_range->nmsrs)
5412
+ return 0;
5413
+
5414
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5415
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5416
+ return -EINVAL;
5417
+
5418
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
5419
+ if (IS_ERR(bitmap))
5420
+ return PTR_ERR(bitmap);
5421
+
5422
+ range = (struct msr_bitmap_range) {
5423
+ .flags = user_range->flags,
5424
+ .base = user_range->base,
5425
+ .nmsrs = user_range->nmsrs,
5426
+ .bitmap = bitmap,
5427
+ };
5428
+
5429
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5430
+ r = -EINVAL;
5431
+ goto err;
5432
+ }
5433
+
5434
+ if (!range.flags) {
5435
+ r = -EINVAL;
5436
+ goto err;
5437
+ }
5438
+
5439
+ /* Everything ok, add this range identifier. */
5440
+ msr_filter->ranges[msr_filter->count] = range;
5441
+ msr_filter->count++;
5442
+
5443
+ return 0;
5444
+err:
5445
+ kfree(bitmap);
5446
+ return r;
5447
+}
5448
+
5449
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
5450
+ struct kvm_msr_filter *filter)
5451
+{
5452
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
5453
+ bool default_allow;
5454
+ bool empty = true;
5455
+ int r = 0;
5456
+ u32 i;
5457
+
5458
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
5459
+ return -EINVAL;
5460
+
5461
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
5462
+ empty &= !filter->ranges[i].nmsrs;
5463
+
5464
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
5465
+ if (empty && !default_allow)
5466
+ return -EINVAL;
5467
+
5468
+ new_filter = kvm_alloc_msr_filter(default_allow);
5469
+ if (!new_filter)
5470
+ return -ENOMEM;
5471
+
5472
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
5473
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
5474
+ if (r) {
5475
+ kvm_free_msr_filter(new_filter);
5476
+ return r;
5477
+ }
5478
+ }
5479
+
5480
+ mutex_lock(&kvm->lock);
5481
+
5482
+ /* The per-VM filter is protected by kvm->lock... */
5483
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
5484
+
5485
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
5486
+ synchronize_srcu(&kvm->srcu);
5487
+
5488
+ kvm_free_msr_filter(old_filter);
5489
+
5490
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5491
+ mutex_unlock(&kvm->lock);
5492
+
5493
+ return 0;
5494
+}
5495
+
5496
+#ifdef CONFIG_KVM_COMPAT
5497
+/* for KVM_X86_SET_MSR_FILTER */
5498
+struct kvm_msr_filter_range_compat {
5499
+ __u32 flags;
5500
+ __u32 nmsrs;
5501
+ __u32 base;
5502
+ __u32 bitmap;
5503
+};
5504
+
5505
+struct kvm_msr_filter_compat {
5506
+ __u32 flags;
5507
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
5508
+};
5509
+
5510
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
5511
+
5512
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5513
+ unsigned long arg)
5514
+{
5515
+ void __user *argp = (void __user *)arg;
5516
+ struct kvm *kvm = filp->private_data;
5517
+ long r = -ENOTTY;
5518
+
5519
+ switch (ioctl) {
5520
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
5521
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5522
+ struct kvm_msr_filter_compat filter_compat;
5523
+ struct kvm_msr_filter filter;
5524
+ int i;
5525
+
5526
+ if (copy_from_user(&filter_compat, user_msr_filter,
5527
+ sizeof(filter_compat)))
5528
+ return -EFAULT;
5529
+
5530
+ filter.flags = filter_compat.flags;
5531
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5532
+ struct kvm_msr_filter_range_compat *cr;
5533
+
5534
+ cr = &filter_compat.ranges[i];
5535
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
5536
+ .flags = cr->flags,
5537
+ .nmsrs = cr->nmsrs,
5538
+ .base = cr->base,
5539
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
5540
+ };
5541
+ }
5542
+
5543
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5544
+ break;
5545
+ }
5546
+ }
5547
+
5548
+ return r;
5549
+}
5550
+#endif
45285551
45295552 long kvm_arch_vm_ioctl(struct file *filp,
45305553 unsigned int ioctl, unsigned long arg)
....@@ -4555,7 +5578,7 @@
45555578 if (kvm->created_vcpus)
45565579 goto set_identity_unlock;
45575580 r = -EFAULT;
4558
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
5581
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
45595582 goto set_identity_unlock;
45605583 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
45615584 set_identity_unlock:
....@@ -4639,7 +5662,7 @@
46395662 if (r)
46405663 goto get_irqchip_out;
46415664 r = -EFAULT;
4642
- if (copy_to_user(argp, chip, sizeof *chip))
5665
+ if (copy_to_user(argp, chip, sizeof(*chip)))
46435666 goto get_irqchip_out;
46445667 r = 0;
46455668 get_irqchip_out:
....@@ -4660,9 +5683,6 @@
46605683 if (!irqchip_kernel(kvm))
46615684 goto set_irqchip_out;
46625685 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4663
- if (r)
4664
- goto set_irqchip_out;
4665
- r = 0;
46665686 set_irqchip_out:
46675687 kfree(chip);
46685688 break;
....@@ -4685,7 +5705,7 @@
46855705 }
46865706 case KVM_SET_PIT: {
46875707 r = -EFAULT;
4688
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
5708
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
46895709 goto out;
46905710 mutex_lock(&kvm->lock);
46915711 r = -ENXIO;
....@@ -4726,6 +5746,9 @@
47265746 struct kvm_reinject_control control;
47275747 r = -EFAULT;
47285748 if (copy_from_user(&control, argp, sizeof(control)))
5749
+ goto out;
5750
+ r = -ENXIO;
5751
+ if (!kvm->arch.vpit)
47295752 goto out;
47305753 r = kvm_vm_ioctl_reinject(kvm, &control);
47315754 break;
....@@ -4790,19 +5813,10 @@
47905813 r = 0;
47915814 break;
47925815 }
4793
- case KVM_ENABLE_CAP: {
4794
- struct kvm_enable_cap cap;
4795
-
4796
- r = -EFAULT;
4797
- if (copy_from_user(&cap, argp, sizeof(cap)))
4798
- goto out;
4799
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4800
- break;
4801
- }
48025816 case KVM_MEMORY_ENCRYPT_OP: {
48035817 r = -ENOTTY;
4804
- if (kvm_x86_ops->mem_enc_op)
4805
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
5818
+ if (kvm_x86_ops.mem_enc_op)
5819
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
48065820 break;
48075821 }
48085822 case KVM_MEMORY_ENCRYPT_REG_REGION: {
....@@ -4813,8 +5827,8 @@
48135827 goto out;
48145828
48155829 r = -ENOTTY;
4816
- if (kvm_x86_ops->mem_enc_reg_region)
4817
- r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
5830
+ if (kvm_x86_ops.mem_enc_reg_region)
5831
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
48185832 break;
48195833 }
48205834 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
....@@ -4825,8 +5839,8 @@
48255839 goto out;
48265840
48275841 r = -ENOTTY;
4828
- if (kvm_x86_ops->mem_enc_unreg_region)
4829
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
5842
+ if (kvm_x86_ops.mem_enc_unreg_region)
5843
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
48305844 break;
48315845 }
48325846 case KVM_HYPERV_EVENTFD: {
@@ -4838,6 +5852,19 @@
 		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
 		break;
 	}
+	case KVM_SET_PMU_EVENT_FILTER:
+		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+		break;
+	case KVM_X86_SET_MSR_FILTER: {
+		struct kvm_msr_filter __user *user_msr_filter = argp;
+		struct kvm_msr_filter filter;
+
+		if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+			return -EFAULT;
+
+		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+		break;
+	}
 	default:
 		r = -ENOTTY;
 	}
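The KVM_X86_SET_MSR_FILTER VM ioctl added above copies a struct kvm_msr_filter from userspace and installs it via kvm_vm_ioctl_set_msr_filter(). A hedged userspace sketch follows: it default-allows all MSRs and filters guest writes to one hypothetical MSR index (0x123). Per the upstream API documentation a set bitmap bit allows the access and a clear bit filters it; filtered accesses are reported to userspace only if KVM_CAP_X86_USER_SPACE_MSR has been enabled, otherwise they raise #GP in the guest.

/* Sketch: filter guest writes to one (hypothetical) MSR, allow the rest. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = kvm >= 0 ? ioctl(kvm, KVM_CREATE_VM, 0) : -1;
	struct kvm_msr_filter filter;
	uint8_t bitmap[1] = { 0 };	/* bit 0 clear = filter MSR base + 0 */

	if (vm < 0)
		return 1;

	memset(&filter, 0, sizeof(filter));
	filter.flags = 0;			/* default allow */
	filter.ranges[0].flags = KVM_MSR_FILTER_WRITE;
	filter.ranges[0].base = 0x123;		/* hypothetical MSR index */
	filter.ranges[0].nmsrs = 1;
	filter.ranges[0].bitmap = bitmap;

	if (ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter) < 0) {
		perror("KVM_X86_SET_MSR_FILTER");
		return 1;
	}
	printf("guest writes to MSR 0x123 are now filtered\n");
	return 0;
}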
....@@ -4847,58 +5874,96 @@
48475874
48485875 static void kvm_init_msr_list(void)
48495876 {
5877
+ struct x86_pmu_capability x86_pmu;
48505878 u32 dummy[2];
4851
- unsigned i, j;
5879
+ unsigned i;
48525880
4853
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4854
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
5881
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5882
+ "Please update the fixed PMCs in msrs_to_saved_all[]");
5883
+
5884
+ perf_get_x86_pmu_capability(&x86_pmu);
5885
+
5886
+ num_msrs_to_save = 0;
5887
+ num_emulated_msrs = 0;
5888
+ num_msr_based_features = 0;
5889
+
5890
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5891
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
48555892 continue;
48565893
48575894 /*
48585895 * Even MSRs that are valid in the host may not be exposed
48595896 * to the guests in some cases.
48605897 */
4861
- switch (msrs_to_save[i]) {
5898
+ switch (msrs_to_save_all[i]) {
48625899 case MSR_IA32_BNDCFGS:
48635900 if (!kvm_mpx_supported())
48645901 continue;
48655902 break;
48665903 case MSR_TSC_AUX:
4867
- if (!kvm_x86_ops->rdtscp_supported())
5904
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5905
+ continue;
5906
+ break;
5907
+ case MSR_IA32_UMWAIT_CONTROL:
5908
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5909
+ continue;
5910
+ break;
5911
+ case MSR_IA32_RTIT_CTL:
5912
+ case MSR_IA32_RTIT_STATUS:
5913
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5914
+ continue;
5915
+ break;
5916
+ case MSR_IA32_RTIT_CR3_MATCH:
5917
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5918
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5919
+ continue;
5920
+ break;
5921
+ case MSR_IA32_RTIT_OUTPUT_BASE:
5922
+ case MSR_IA32_RTIT_OUTPUT_MASK:
5923
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5924
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5925
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5926
+ continue;
5927
+ break;
5928
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5929
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5930
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5931
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5932
+ continue;
5933
+ break;
5934
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
5935
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
5936
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
5937
+ continue;
5938
+ break;
5939
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
5940
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
5941
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
48685942 continue;
48695943 break;
48705944 default:
48715945 break;
48725946 }
48735947
4874
- if (j < i)
4875
- msrs_to_save[j] = msrs_to_save[i];
4876
- j++;
5948
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
48775949 }
4878
- num_msrs_to_save = j;
48795950
4880
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4881
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
5951
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
5952
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
48825953 continue;
48835954
4884
- if (j < i)
4885
- emulated_msrs[j] = emulated_msrs[i];
4886
- j++;
5955
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
48875956 }
4888
- num_emulated_msrs = j;
48895957
4890
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
5958
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
48915959 struct kvm_msr_entry msr;
48925960
4893
- msr.index = msr_based_features[i];
5961
+ msr.index = msr_based_features_all[i];
48945962 if (kvm_get_msr_feature(&msr))
48955963 continue;
48965964
4897
- if (j < i)
4898
- msr_based_features[j] = msr_based_features[i];
4899
- j++;
5965
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
49005966 }
4901
- num_msr_based_features = j;
49025967 }
49035968
49045969 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
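The kvm_init_msr_list() rework above stops compacting the arrays in place with an i/j pair and instead copies supported entries out of read-only msrs_to_save_all / emulated_msrs_all / msr_based_features_all tables while counting them. A small standalone sketch of that source-to-destination filtering pattern follows; the names, MSR indices, and predicate are illustrative, not the kernel's.

/* Illustrative filter-copy pattern: keep only entries a probe accepts. */
#include <stddef.h>
#include <stdio.h>

static const unsigned int msrs_all_demo[] = { 0x10, 0x174, 0x175, 0x176, 0xc0000080 };
static unsigned int msrs_supported[sizeof(msrs_all_demo) / sizeof(msrs_all_demo[0])];
static size_t num_supported;

/* Stand-in for the rdmsr_safe()/capability probing done per entry. */
static int msr_is_supported(unsigned int msr)
{
	return msr != 0x176;	/* pretend one entry probes as unsupported */
}

int main(void)
{
	size_t i;

	num_supported = 0;
	for (i = 0; i < sizeof(msrs_all_demo) / sizeof(msrs_all_demo[0]); i++) {
		if (!msr_is_supported(msrs_all_demo[i]))
			continue;
		msrs_supported[num_supported++] = msrs_all_demo[i];
	}

	for (i = 0; i < num_supported; i++)
		printf("0x%x\n", msrs_supported[i]);
	return 0;
}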
....@@ -4947,13 +6012,13 @@
49476012 static void kvm_set_segment(struct kvm_vcpu *vcpu,
49486013 struct kvm_segment *var, int seg)
49496014 {
4950
- kvm_x86_ops->set_segment(vcpu, var, seg);
6015
+ kvm_x86_ops.set_segment(vcpu, var, seg);
49516016 }
49526017
49536018 void kvm_get_segment(struct kvm_vcpu *vcpu,
49546019 struct kvm_segment *var, int seg)
49556020 {
4956
- kvm_x86_ops->get_segment(vcpu, var, seg);
6021
+ kvm_x86_ops.get_segment(vcpu, var, seg);
49576022 }
49586023
49596024 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
....@@ -4965,7 +6030,7 @@
49656030
49666031 /* NPT walks are always user-walks */
49676032 access |= PFERR_USER_MASK;
4968
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
6033
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
49696034
49706035 return t_gpa;
49716036 }
....@@ -4973,14 +6038,14 @@
49736038 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
49746039 struct x86_exception *exception)
49756040 {
4976
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6041
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49776042 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49786043 }
49796044
49806045 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
49816046 struct x86_exception *exception)
49826047 {
4983
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6048
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49846049 access |= PFERR_FETCH_MASK;
49856050 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49866051 }
....@@ -4988,7 +6053,7 @@
49886053 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
49896054 struct x86_exception *exception)
49906055 {
4991
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6056
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49926057 access |= PFERR_WRITE_MASK;
49936058 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49946059 }
....@@ -5037,7 +6102,7 @@
50376102 struct x86_exception *exception)
50386103 {
50396104 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5040
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6105
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50416106 unsigned offset;
50426107 int ret;
50436108
....@@ -5062,7 +6127,7 @@
50626127 gva_t addr, void *val, unsigned int bytes,
50636128 struct x86_exception *exception)
50646129 {
5065
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6130
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50666131
50676132 /*
50686133 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
....@@ -5083,7 +6148,7 @@
50836148 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
50846149 u32 access = 0;
50856150
5086
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6151
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
50876152 access |= PFERR_USER_MASK;
50886153
50896154 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
....@@ -5136,7 +6201,7 @@
51366201 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
51376202 u32 access = PFERR_WRITE_MASK;
51386203
5139
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6204
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
51406205 access |= PFERR_USER_MASK;
51416206
51426207 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
....@@ -5149,13 +6214,6 @@
51496214 /* kvm_write_guest_virt_system can pull in tons of pages. */
51506215 vcpu->arch.l1tf_flush_l1d = true;
51516216
5152
- /*
5153
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5154
- * is returned, but our callers are not ready for that and they blindly
5155
- * call kvm_inject_page_fault. Ensure that they at least do not leak
5156
- * uninitialized kernel stack memory into cr2 and error code.
5157
- */
5158
- memset(exception, 0, sizeof(*exception));
51596217 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
51606218 PFERR_WRITE_MASK, exception);
51616219 }
....@@ -5163,25 +6221,23 @@
51636221
51646222 int handle_ud(struct kvm_vcpu *vcpu)
51656223 {
6224
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
51666225 int emul_type = EMULTYPE_TRAP_UD;
5167
- enum emulation_result er;
51686226 char sig[5]; /* ud2; .ascii "kvm" */
51696227 struct x86_exception e;
6228
+
6229
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6230
+ return 1;
51706231
51716232 if (force_emulation_prefix &&
51726233 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
51736234 sig, sizeof(sig), &e) == 0 &&
5174
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
6235
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
51756236 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
5176
- emul_type = 0;
6237
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
51776238 }
51786239
5179
- er = kvm_emulate_instruction(vcpu, emul_type);
5180
- if (er == EMULATE_USER_EXIT)
5181
- return 0;
5182
- if (er != EMULATE_DONE)
5183
- kvm_queue_exception(vcpu, UD_VECTOR);
5184
- return 1;
6240
+ return kvm_emulate_instruction(vcpu, emul_type);
51856241 }
51866242 EXPORT_SYMBOL_GPL(handle_ud);
51876243
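With force_emulation_prefix enabled, handle_ud() above compares the five bytes at the guest's linear RIP against __KVM_EMULATE_PREFIX and, on a match, advances RIP past the prefix and emulates what follows. A standalone sketch of that signature check follows; the byte expansion (the ud2 opcode followed by "kvm") is stated here as an assumption about what the macro produces.

/* Standalone check for the 5-byte forced-emulation prefix: ud2 + "kvm". */
#include <stdio.h>
#include <string.h>

/* Assumed expansion of __KVM_EMULATE_PREFIX: 0x0f 0x0b is the ud2 opcode. */
static const unsigned char kvm_emulate_prefix[5] = { 0x0f, 0x0b, 'k', 'v', 'm' };

/* Returns 1 if the instruction stream starts with the prefix. */
static int starts_with_emulate_prefix(const unsigned char *rip_bytes, size_t len)
{
	if (len < sizeof(kvm_emulate_prefix))
		return 0;
	return memcmp(rip_bytes, kvm_emulate_prefix,
		      sizeof(kvm_emulate_prefix)) == 0;
}

int main(void)
{
	/* Hypothetical guest bytes: prefix followed by a one-byte nop. */
	const unsigned char stream[] = { 0x0f, 0x0b, 'k', 'v', 'm', 0x90 };

	if (starts_with_emulate_prefix(stream, sizeof(stream)))
		printf("skip %zu prefix bytes, emulate what follows\n",
		       sizeof(kvm_emulate_prefix));
	return 0;
}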
....@@ -5204,7 +6260,7 @@
52046260 gpa_t *gpa, struct x86_exception *exception,
52056261 bool write)
52066262 {
5207
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6263
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
52086264 | (write ? PFERR_WRITE_MASK : 0);
52096265
52106266 /*
....@@ -5214,7 +6270,7 @@
52146270 */
52156271 if (vcpu_match_mmio_gva(vcpu, gva)
52166272 && !permission_fault(vcpu, vcpu->arch.walk_mmu,
5217
- vcpu->arch.access, 0, access)) {
6273
+ vcpu->arch.mmio_access, 0, access)) {
52186274 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
52196275 (gva & (PAGE_SIZE - 1));
52206276 trace_vcpu_match_mmio(gva, *gpa, write, false);
....@@ -5323,7 +6379,7 @@
53236379 int handled, ret;
53246380 bool write = ops->write;
53256381 struct kvm_mmio_fragment *frag;
5326
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6382
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
53276383
53286384 /*
53296385 * If the exit was due to a NPF we may already have a GPA.
....@@ -5332,10 +6388,9 @@
53326388 * operation using rep will only have the initial GPA from the NPF
53336389 * occurred.
53346390 */
5335
- if (vcpu->arch.gpa_available &&
5336
- emulator_can_use_gpa(ctxt) &&
5337
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5338
- gpa = vcpu->arch.gpa_val;
6391
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6392
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6393
+ gpa = ctxt->gpa_val;
53396394 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
53406395 } else {
53416396 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5456,9 +6511,10 @@
 				     unsigned int bytes,
 				     struct x86_exception *exception)
 {
+	struct kvm_host_map map;
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	u64 page_line_mask;
 	gpa_t gpa;
-	struct page *page;
 	char *kaddr;
 	bool exchanged;

@@ -5472,15 +6528,23 @@
 	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto emul_write;

-	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+	/*
+	 * Emulate the atomic as a straight write to avoid #AC if SLD is
+	 * enabled in the host and the access splits a cache line.
+	 */
+	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+		page_line_mask = ~(cache_line_size() - 1);
+	else
+		page_line_mask = PAGE_MASK;
+
+	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
 		goto emul_write;

-	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
-	if (is_error_page(page))
+	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
 		goto emul_write;

-	kaddr = kmap_atomic(page);
-	kaddr += offset_in_page(gpa);
+	kaddr = map.hva + offset_in_page(gpa);
+
 	switch (bytes) {
 	case 1:
 		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
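The emulated cmpxchg above now falls back to a plain write whenever split-lock detection is on and the access would straddle a cache line: it masks the first and last byte address with ~(cache_line_size() - 1) and compares the results. A small standalone version of that boundary arithmetic follows; the 64-byte line size is an assumption made only for the demo.

/* Does an access of 'bytes' at 'gpa' cross a boundary of size 'align'? */
#include <stdint.h>
#include <stdio.h>

static int crosses_boundary(uint64_t gpa, unsigned int bytes, uint64_t align)
{
	uint64_t mask = ~(align - 1);	/* align must be a power of two */

	return ((gpa + bytes - 1) & mask) != (gpa & mask);
}

int main(void)
{
	const uint64_t cache_line = 64;	/* assumed line size for the demo */

	/* 8-byte access at offset 0x3c spans bytes 0x3c..0x43: crosses a line. */
	printf("0x3c/8: %d\n", crosses_boundary(0x3c, 8, cache_line));
	/* 8-byte access at offset 0x40 stays inside one line. */
	printf("0x40/8: %d\n", crosses_boundary(0x40, 8, cache_line));
	return 0;
}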
....@@ -5497,13 +6561,12 @@
54976561 default:
54986562 BUG();
54996563 }
5500
- kunmap_atomic(kaddr);
5501
- kvm_release_page_dirty(page);
6564
+
6565
+ kvm_vcpu_unmap(vcpu, &map, true);
55026566
55036567 if (!exchanged)
55046568 return X86EMUL_CMPXCHG_FAILED;
55056569
5506
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
55076570 kvm_page_track_write(vcpu, gpa, new, bytes);
55086571
55096572 return X86EMUL_CONTINUE;
....@@ -5557,11 +6620,9 @@
55576620 return 0;
55586621 }
55596622
5560
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5561
- int size, unsigned short port, void *val,
5562
- unsigned int count)
6623
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6624
+ unsigned short port, void *val, unsigned int count)
55636625 {
5564
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
55656626 int ret;
55666627
55676628 if (vcpu->arch.pio.count)
....@@ -5581,20 +6642,33 @@
55816642 return 0;
55826643 }
55836644
5584
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
5585
- int size, unsigned short port,
5586
- const void *val, unsigned int count)
6645
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6646
+ int size, unsigned short port, void *val,
6647
+ unsigned int count)
55876648 {
5588
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6649
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
55896650
6651
+}
6652
+
6653
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6654
+ unsigned short port, const void *val,
6655
+ unsigned int count)
6656
+{
55906657 memcpy(vcpu->arch.pio_data, val, size * count);
55916658 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
55926659 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
55936660 }
55946661
6662
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6663
+ int size, unsigned short port,
6664
+ const void *val, unsigned int count)
6665
+{
6666
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6667
+}
6668
+
55956669 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
55966670 {
5597
- return kvm_x86_ops->get_segment_base(vcpu, seg);
6671
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
55986672 }
55996673
56006674 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
....@@ -5607,7 +6681,7 @@
56076681 if (!need_emulate_wbinvd(vcpu))
56086682 return X86EMUL_CONTINUE;
56096683
5610
- if (kvm_x86_ops->has_wbinvd_exit()) {
6684
+ if (kvm_x86_ops.has_wbinvd_exit()) {
56116685 int cpu = get_cpu();
56126686
56136687 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
....@@ -5712,27 +6786,27 @@
57126786
57136787 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
57146788 {
5715
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
6789
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
57166790 }
57176791
57186792 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57196793 {
5720
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
6794
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
57216795 }
57226796
57236797 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57246798 {
5725
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
6799
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
57266800 }
57276801
57286802 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57296803 {
5730
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
6804
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
57316805 }
57326806
57336807 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57346808 {
5735
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
6809
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
57366810 }
57376811
57386812 static unsigned long emulator_get_cached_segment_base(
....@@ -5810,28 +6884,33 @@
58106884 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
58116885 u32 msr_index, u64 *pdata)
58126886 {
5813
- struct msr_data msr;
6887
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
58146888 int r;
58156889
5816
- msr.index = msr_index;
5817
- msr.host_initiated = false;
5818
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
5819
- if (r)
5820
- return r;
6890
+ r = kvm_get_msr(vcpu, msr_index, pdata);
58216891
5822
- *pdata = msr.data;
5823
- return 0;
6892
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6893
+ /* Bounce to user space */
6894
+ return X86EMUL_IO_NEEDED;
6895
+ }
6896
+
6897
+ return r;
58246898 }
58256899
58266900 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
58276901 u32 msr_index, u64 data)
58286902 {
5829
- struct msr_data msr;
6903
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6904
+ int r;
58306905
5831
- msr.data = data;
5832
- msr.index = msr_index;
5833
- msr.host_initiated = false;
5834
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
6906
+ r = kvm_set_msr(vcpu, msr_index, data);
6907
+
6908
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6909
+ /* Bounce to user space */
6910
+ return X86EMUL_IO_NEEDED;
6911
+ }
6912
+
6913
+ return r;
58356914 }
58366915
58376916 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
....@@ -5851,7 +6930,7 @@
58516930 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
58526931 u32 pmc)
58536932 {
5854
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
6933
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
58556934 }
58566935
58576936 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
....@@ -5869,13 +6948,35 @@
58696948 struct x86_instruction_info *info,
58706949 enum x86_intercept_stage stage)
58716950 {
5872
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
6951
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
6952
+ &ctxt->exception);
58736953 }
58746954
58756955 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5876
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
6956
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
6957
+ bool exact_only)
58776958 {
5878
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
6959
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
6960
+}
6961
+
6962
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
6963
+{
6964
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
6965
+}
6966
+
6967
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
6968
+{
6969
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
6970
+}
6971
+
6972
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
6973
+{
6974
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
6975
+}
6976
+
6977
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
6978
+{
6979
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
58796980 }
58806981
58816982 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
....@@ -5890,7 +6991,7 @@
58906991
58916992 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
58926993 {
5893
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
6994
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
58946995 }
58956996
58966997 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
....@@ -5900,12 +7001,26 @@
59007001
59017002 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
59027003 {
5903
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
7004
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7005
+
7006
+ vcpu->arch.hflags = emul_flags;
7007
+ kvm_mmu_reset_context(vcpu);
59047008 }
59057009
5906
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
7010
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
7011
+ const char *smstate)
59077012 {
5908
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
7013
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
7014
+}
7015
+
7016
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
7017
+{
7018
+ kvm_smm_changed(emul_to_vcpu(ctxt));
7019
+}
7020
+
7021
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
7022
+{
7023
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
59097024 }
59107025
59117026 static const struct x86_emulate_ops emulate_ops = {
....@@ -5944,15 +7059,21 @@
59447059 .fix_hypercall = emulator_fix_hypercall,
59457060 .intercept = emulator_intercept,
59467061 .get_cpuid = emulator_get_cpuid,
7062
+ .guest_has_long_mode = emulator_guest_has_long_mode,
7063
+ .guest_has_movbe = emulator_guest_has_movbe,
7064
+ .guest_has_fxsr = emulator_guest_has_fxsr,
7065
+ .guest_has_rdpid = emulator_guest_has_rdpid,
59477066 .set_nmi_mask = emulator_set_nmi_mask,
59487067 .get_hflags = emulator_get_hflags,
59497068 .set_hflags = emulator_set_hflags,
59507069 .pre_leave_smm = emulator_pre_leave_smm,
7070
+ .post_leave_smm = emulator_post_leave_smm,
7071
+ .set_xcr = emulator_set_xcr,
59517072 };
59527073
59537074 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
59547075 {
5955
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
7076
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
59567077 /*
59577078 * an sti; sti; sequence only disable interrupts for the first
59587079 * instruction. So, if the last instruction, be it emulated or
....@@ -5963,7 +7084,7 @@
59637084 if (int_shadow & mask)
59647085 mask = 0;
59657086 if (unlikely(int_shadow || mask)) {
5966
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
7087
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
59677088 if (!mask)
59687089 kvm_make_request(KVM_REQ_EVENT, vcpu);
59697090 }
....@@ -5971,9 +7092,9 @@
59717092
59727093 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
59737094 {
5974
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7095
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59757096 if (ctxt->exception.vector == PF_VECTOR)
5976
- return kvm_propagate_fault(vcpu, &ctxt->exception);
7097
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
59777098
59787099 if (ctxt->exception.error_code_valid)
59797100 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
....@@ -5983,13 +7104,31 @@
59837104 return false;
59847105 }
59857106
7107
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
7108
+{
7109
+ struct x86_emulate_ctxt *ctxt;
7110
+
7111
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
7112
+ if (!ctxt) {
7113
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
7114
+ return NULL;
7115
+ }
7116
+
7117
+ ctxt->vcpu = vcpu;
7118
+ ctxt->ops = &emulate_ops;
7119
+ vcpu->arch.emulate_ctxt = ctxt;
7120
+
7121
+ return ctxt;
7122
+}
7123
+
59867124 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
59877125 {
5988
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7126
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59897127 int cs_db, cs_l;
59907128
5991
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
7129
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
59927130
7131
+ ctxt->gpa_available = false;
59937132 ctxt->eflags = kvm_get_rflags(vcpu);
59947133 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
59957134
....@@ -6003,13 +7142,18 @@
60037142 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
60047143 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
60057144
7145
+ ctxt->interruptibility = 0;
7146
+ ctxt->have_exception = false;
7147
+ ctxt->exception.vector = -1;
7148
+ ctxt->perm_ok = false;
7149
+
60067150 init_decode_cache(ctxt);
60077151 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
60087152 }
60097153
6010
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
7154
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
60117155 {
6012
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7156
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
60137157 int ret;
60147158
60157159 init_emulate_ctxt(vcpu);
....@@ -6019,37 +7163,43 @@
60197163 ctxt->_eip = ctxt->eip + inc_eip;
60207164 ret = emulate_int_real(ctxt, irq);
60217165
6022
- if (ret != X86EMUL_CONTINUE)
6023
- return EMULATE_FAIL;
6024
-
6025
- ctxt->eip = ctxt->_eip;
6026
- kvm_rip_write(vcpu, ctxt->eip);
6027
- kvm_set_rflags(vcpu, ctxt->eflags);
6028
-
6029
- return EMULATE_DONE;
7166
+ if (ret != X86EMUL_CONTINUE) {
7167
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7168
+ } else {
7169
+ ctxt->eip = ctxt->_eip;
7170
+ kvm_rip_write(vcpu, ctxt->eip);
7171
+ kvm_set_rflags(vcpu, ctxt->eflags);
7172
+ }
60307173 }
60317174 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
60327175
60337176 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
60347177 {
6035
- int r = EMULATE_DONE;
6036
-
60377178 ++vcpu->stat.insn_emulation_fail;
60387179 trace_kvm_emulate_insn_failed(vcpu);
60397180
6040
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
6041
- return EMULATE_FAIL;
7181
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
7182
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7183
+ return 1;
7184
+ }
60427185
6043
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
7186
+ if (emulation_type & EMULTYPE_SKIP) {
60447187 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
60457188 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
60467189 vcpu->run->internal.ndata = 0;
6047
- r = EMULATE_USER_EXIT;
7190
+ return 0;
60487191 }
60497192
60507193 kvm_queue_exception(vcpu, UD_VECTOR);
60517194
6052
- return r;
7195
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7196
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7197
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7198
+ vcpu->run->internal.ndata = 0;
7199
+ return 0;
7200
+ }
7201
+
7202
+ return 1;
60537203 }
60547204
60557205 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
....@@ -6059,13 +7209,14 @@
60597209 gpa_t gpa = cr2_or_gpa;
60607210 kvm_pfn_t pfn;
60617211
6062
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7212
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
60637213 return false;
60647214
6065
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7215
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7216
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
60667217 return false;
60677218
6068
- if (!vcpu->arch.mmu.direct_map) {
7219
+ if (!vcpu->arch.mmu->direct_map) {
60697220 /*
60707221 * Write permission should be allowed since only
60717222 * write access need to be emulated.
....@@ -6098,7 +7249,7 @@
60987249 kvm_release_pfn_clean(pfn);
60997250
61007251 /* The instructions are well-emulated on direct mmu. */
6101
- if (vcpu->arch.mmu.direct_map) {
7252
+ if (vcpu->arch.mmu->direct_map) {
61027253 unsigned int indirect_shadow_pages;
61037254
61047255 spin_lock(&vcpu->kvm->mmu_lock);
....@@ -6150,10 +7301,11 @@
61507301 */
61517302 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
61527303
6153
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7304
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
61547305 return false;
61557306
6156
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7307
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7308
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
61577309 return false;
61587310
61597311 if (x86_page_table_writing_insn(ctxt))
....@@ -6165,7 +7317,7 @@
61657317 vcpu->arch.last_retry_eip = ctxt->eip;
61667318 vcpu->arch.last_retry_addr = cr2_or_gpa;
61677319
6168
- if (!vcpu->arch.mmu.direct_map)
7320
+ if (!vcpu->arch.mmu->direct_map)
61697321 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
61707322
61717323 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
....@@ -6189,16 +7341,6 @@
61897341 kvm_mmu_reset_context(vcpu);
61907342 }
61917343
6192
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
6193
-{
6194
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
6195
-
6196
- vcpu->arch.hflags = emul_flags;
6197
-
6198
- if (changed & HF_SMM_MASK)
6199
- kvm_smm_changed(vcpu);
6200
-}
6201
-
62027344 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
62037345 unsigned long *db)
62047346 {
....@@ -6214,34 +7356,29 @@
62147356 return dr6;
62157357 }
62167358
6217
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
7359
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
62187360 {
62197361 struct kvm_run *kvm_run = vcpu->run;
62207362
62217363 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
62227364 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
6223
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
7365
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
62247366 kvm_run->debug.arch.exception = DB_VECTOR;
62257367 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6226
- *r = EMULATE_USER_EXIT;
6227
- } else {
6228
- /*
6229
- * "Certain debug exceptions may clear bit 0-3. The
6230
- * remaining contents of the DR6 register are never
6231
- * cleared by the processor".
6232
- */
6233
- vcpu->arch.dr6 &= ~15;
6234
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
6235
- kvm_queue_exception(vcpu, DB_VECTOR);
7368
+ return 0;
62367369 }
7370
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7371
+ return 1;
62377372 }
62387373
62397374 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
62407375 {
6241
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
6242
- int r = EMULATE_DONE;
7376
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7377
+ int r;
62437378
6244
- kvm_x86_ops->skip_emulated_instruction(vcpu);
7379
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7380
+ if (unlikely(!r))
7381
+ return 0;
62457382
62467383 /*
62477384 * rflags is the old, "raw" value of the flags. The new value has
....@@ -6252,12 +7389,12 @@
62527389 * that sets the TF flag".
62537390 */
62547391 if (unlikely(rflags & X86_EFLAGS_TF))
6255
- kvm_vcpu_do_singlestep(vcpu, &r);
6256
- return r == EMULATE_DONE;
7392
+ r = kvm_vcpu_do_singlestep(vcpu);
7393
+ return r;
62577394 }
62587395 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
62597396
6260
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
7397
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
62617398 {
62627399 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
62637400 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
....@@ -6272,7 +7409,7 @@
62727409 kvm_run->debug.arch.pc = eip;
62737410 kvm_run->debug.arch.exception = DB_VECTOR;
62747411 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6275
- *r = EMULATE_USER_EXIT;
7412
+ *r = 0;
62767413 return true;
62777414 }
62787415 }
....@@ -6285,10 +7422,8 @@
62857422 vcpu->arch.db);
62867423
62877424 if (dr6 != 0) {
6288
- vcpu->arch.dr6 &= ~15;
6289
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
6290
- kvm_queue_exception(vcpu, DB_VECTOR);
6291
- *r = EMULATE_DONE;
7425
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7426
+ *r = 1;
62927427 return true;
62937428 }
62947429 }
....@@ -6327,13 +7462,45 @@
63277462 return false;
63287463 }
63297464
7465
+/*
7466
+ * Decode an instruction for emulation. The caller is responsible for handling
7467
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
7468
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
7469
+ * code breakpoints have higher priority and thus have already been done by
7470
+ * hardware.
7471
+ *
7472
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
7473
+ * response to a machine check.
7474
+ */
7475
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
7476
+ void *insn, int insn_len)
7477
+{
7478
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7479
+ int r;
7480
+
7481
+ init_emulate_ctxt(vcpu);
7482
+
7483
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7484
+
7485
+ r = x86_decode_insn(ctxt, insn, insn_len);
7486
+
7487
+ trace_kvm_emulate_insn_start(vcpu);
7488
+ ++vcpu->stat.insn_emulation;
7489
+
7490
+ return r;
7491
+}
7492
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
7493
+
63307494 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
63317495 int emulation_type, void *insn, int insn_len)
63327496 {
63337497 int r;
6334
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7498
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63357499 bool writeback = true;
6336
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7500
+ bool write_fault_to_spt;
7501
+
7502
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7503
+ return 1;
63377504
63387505 vcpu->arch.l1tf_flush_l1d = true;
63397506
....@@ -6341,39 +7508,33 @@
63417508 * Clear write_fault_to_shadow_pgtable here to ensure it is
63427509 * never reused.
63437510 */
7511
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
63447512 vcpu->arch.write_fault_to_shadow_pgtable = false;
6345
- kvm_clear_exception_queue(vcpu);
63467513
63477514 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
6348
- init_emulate_ctxt(vcpu);
7515
+ kvm_clear_exception_queue(vcpu);
63497516
63507517 /*
6351
- * We will reenter on the same instruction since
6352
- * we do not set complete_userspace_io. This does not
6353
- * handle watchpoints yet, those would be handled in
6354
- * the emulate_ops.
7518
+ * Return immediately if RIP hits a code breakpoint, such #DBs
7519
+ * are fault-like and are higher priority than any faults on
7520
+ * the code fetch itself.
63557521 */
63567522 if (!(emulation_type & EMULTYPE_SKIP) &&
6357
- kvm_vcpu_check_breakpoint(vcpu, &r))
7523
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
63587524 return r;
63597525
6360
- ctxt->interruptibility = 0;
6361
- ctxt->have_exception = false;
6362
- ctxt->exception.vector = -1;
6363
- ctxt->perm_ok = false;
6364
-
6365
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
6366
-
6367
- r = x86_decode_insn(ctxt, insn, insn_len);
6368
-
6369
- trace_kvm_emulate_insn_start(vcpu);
6370
- ++vcpu->stat.insn_emulation;
7526
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
7527
+ insn, insn_len);
63717528 if (r != EMULATION_OK) {
6372
- if (emulation_type & EMULTYPE_TRAP_UD)
6373
- return EMULATE_FAIL;
6374
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
6375
- emulation_type))
6376
- return EMULATE_DONE;
7529
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
7530
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7531
+ kvm_queue_exception(vcpu, UD_VECTOR);
7532
+ return 1;
7533
+ }
7534
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
7535
+ write_fault_to_spt,
7536
+ emulation_type))
7537
+ return 1;
63777538 if (ctxt->have_exception) {
63787539 /*
63797540 * #UD should result in just EMULATION_FAILED, and trap-like
....@@ -6382,27 +7543,32 @@
63827543 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
63837544 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
63847545 inject_emulated_exception(vcpu);
6385
- return EMULATE_DONE;
7546
+ return 1;
63867547 }
6387
- if (emulation_type & EMULTYPE_SKIP)
6388
- return EMULATE_FAIL;
63897548 return handle_emulation_failure(vcpu, emulation_type);
63907549 }
63917550 }
63927551
6393
- if ((emulation_type & EMULTYPE_VMWARE) &&
6394
- !is_vmware_backdoor_opcode(ctxt))
6395
- return EMULATE_FAIL;
7552
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7553
+ !is_vmware_backdoor_opcode(ctxt)) {
7554
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7555
+ return 1;
7556
+ }
63967557
7558
+ /*
7559
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7560
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
7561
+ * updating interruptibility state and injecting single-step #DBs.
7562
+ */
63977563 if (emulation_type & EMULTYPE_SKIP) {
63987564 kvm_rip_write(vcpu, ctxt->_eip);
63997565 if (ctxt->eflags & X86_EFLAGS_RF)
64007566 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
6401
- return EMULATE_DONE;
7567
+ return 1;
64027568 }
64037569
64047570 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
6405
- return EMULATE_DONE;
7571
+ return 1;
64067572
64077573 /* this is needed for vmware backdoor interface to work since it
64087574 changes registers values during IO operation */
....@@ -6412,24 +7578,35 @@
64127578 }
64137579
64147580 restart:
6415
- /* Save the faulting GPA (cr2) in the address field */
6416
- ctxt->exception.address = cr2_or_gpa;
7581
+ if (emulation_type & EMULTYPE_PF) {
7582
+ /* Save the faulting GPA (cr2) in the address field */
7583
+ ctxt->exception.address = cr2_or_gpa;
7584
+
7585
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
7586
+ if (vcpu->arch.mmu->direct_map) {
7587
+ ctxt->gpa_available = true;
7588
+ ctxt->gpa_val = cr2_or_gpa;
7589
+ }
7590
+ } else {
7591
+ /* Sanitize the address out of an abundance of paranoia. */
7592
+ ctxt->exception.address = 0;
7593
+ }
64177594
64187595 r = x86_emulate_insn(ctxt);
64197596
64207597 if (r == EMULATION_INTERCEPTED)
6421
- return EMULATE_DONE;
7598
+ return 1;
64227599
64237600 if (r == EMULATION_FAILED) {
64247601 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
64257602 emulation_type))
6426
- return EMULATE_DONE;
7603
+ return 1;
64277604
64287605 return handle_emulation_failure(vcpu, emulation_type);
64297606 }
64307607
64317608 if (ctxt->have_exception) {
6432
- r = EMULATE_DONE;
7609
+ r = 1;
64337610 if (inject_emulated_exception(vcpu))
64347611 return r;
64357612 } else if (vcpu->arch.pio.count) {
....@@ -6440,26 +7617,36 @@
64407617 writeback = false;
64417618 vcpu->arch.complete_userspace_io = complete_emulated_pio;
64427619 }
6443
- r = EMULATE_USER_EXIT;
7620
+ r = 0;
64447621 } else if (vcpu->mmio_needed) {
7622
+ ++vcpu->stat.mmio_exits;
7623
+
64457624 if (!vcpu->mmio_is_write)
64467625 writeback = false;
6447
- r = EMULATE_USER_EXIT;
7626
+ r = 0;
64487627 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
64497628 } else if (r == EMULATION_RESTART)
64507629 goto restart;
64517630 else
6452
- r = EMULATE_DONE;
7631
+ r = 1;
64537632
64547633 if (writeback) {
6455
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
7634
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
64567635 toggle_interruptibility(vcpu, ctxt->interruptibility);
64577636 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7637
+
7638
+ /*
7639
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
7640
+ * only supports code breakpoints and general detect #DB, both
7641
+ * of which are fault-like.
7642
+ */
64587643 if (!ctxt->have_exception ||
64597644 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
64607645 kvm_rip_write(vcpu, ctxt->eip);
6461
- if (r == EMULATE_DONE && ctxt->tf)
6462
- kvm_vcpu_do_singlestep(vcpu, &r);
7646
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7647
+ r = kvm_vcpu_do_singlestep(vcpu);
7648
+ if (kvm_x86_ops.update_emulated_instruction)
7649
+ kvm_x86_ops.update_emulated_instruction(vcpu);
64637650 __kvm_set_rflags(vcpu, ctxt->eflags);
64647651 }
64657652
....@@ -6509,9 +7696,9 @@
65097696 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
65107697 unsigned short port)
65117698 {
6512
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
6513
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
6514
- size, port, &val, 1);
7699
+ unsigned long val = kvm_rax_read(vcpu);
7700
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7701
+
65157702 if (ret)
65167703 return ret;
65177704
....@@ -6544,16 +7731,14 @@
65447731 }
65457732
65467733 /* For size less than 4 we merge, else we zero extend */
6547
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
6548
- : 0;
7734
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
65497735
65507736 /*
6551
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
7737
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
65527738 * the copy and tracing
65537739 */
6554
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
6555
- vcpu->arch.pio.port, &val, 1);
6556
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7740
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7741
+ kvm_rax_write(vcpu, val);
65577742
65587743 return kvm_skip_emulated_instruction(vcpu);
65597744 }
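The "merge" rule above means a 1- or 2-byte port read replaces only the low bytes of RAX, while a 4-byte read zero-extends into the full register. A small hypothetical helper (not from this file) expressing just that arithmetic:

static unsigned long fast_pio_in_merge(unsigned long old_rax,
				       unsigned long in_val, int size)
{
	/* e.g. old_rax = 0xdeadbeef, a 2-byte IN returning 0x1234 -> 0xdead1234;
	 * a 4-byte IN returning 0x1234 -> 0x0000000000001234. */
	if (size < 4) {
		unsigned long mask = (1UL << (size * 8)) - 1;

		return (old_rax & ~mask) | (in_val & mask);
	}
	return in_val;
}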
....@@ -6565,12 +7750,11 @@
65657750 int ret;
65667751
65677752 /* For size less than 4 we merge, else we zero extend */
6568
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
7753
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
65697754
6570
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
6571
- &val, 1);
7755
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
65727756 if (ret) {
6573
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7757
+ kvm_rax_write(vcpu, val);
65747758 return ret;
65757759 }
65767760
....@@ -6649,10 +7833,8 @@
66497833 }
66507834 #endif
66517835
6652
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
6653
- void *data)
7836
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
66547837 {
6655
- struct cpufreq_freqs *freq = data;
66567838 struct kvm *kvm;
66577839 struct kvm_vcpu *vcpu;
66587840 int i, send_ipi = 0;
....@@ -6696,17 +7878,12 @@
66967878 *
66977879 */
66987880
6699
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
6700
- return 0;
6701
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
6702
- return 0;
6703
-
6704
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7881
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67057882
67067883 mutex_lock(&kvm_lock);
67077884 list_for_each_entry(kvm, &vm_list, vm_list) {
67087885 kvm_for_each_vcpu(i, vcpu, kvm) {
6709
- if (vcpu->cpu != freq->cpu)
7886
+ if (vcpu->cpu != cpu)
67107887 continue;
67117888 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
67127889 if (vcpu->cpu != raw_smp_processor_id())
....@@ -6728,8 +7905,24 @@
67287905 * guest context is entered kvmclock will be updated,
67297906 * so the guest will not see stale values.
67307907 */
6731
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7908
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67327909 }
7910
+}
7911
+
7912
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7913
+ void *data)
7914
+{
7915
+ struct cpufreq_freqs *freq = data;
7916
+ int cpu;
7917
+
7918
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7919
+ return 0;
7920
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7921
+ return 0;
7922
+
7923
+ for_each_cpu(cpu, freq->policy->cpus)
7924
+ __kvmclock_cpufreq_notifier(freq, cpu);
7925
+
67337926 return 0;
67347927 }
67357928
....@@ -6749,20 +7942,21 @@
67497942
67507943 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
67517944 #ifdef CONFIG_CPU_FREQ
6752
- struct cpufreq_policy policy;
7945
+ struct cpufreq_policy *policy;
67537946 int cpu;
67547947
6755
- memset(&policy, 0, sizeof(policy));
67567948 cpu = get_cpu();
6757
- cpufreq_get_policy(&policy, cpu);
6758
- if (policy.cpuinfo.max_freq)
6759
- max_tsc_khz = policy.cpuinfo.max_freq;
7949
+ policy = cpufreq_cpu_get(cpu);
7950
+ if (policy) {
7951
+ if (policy->cpuinfo.max_freq)
7952
+ max_tsc_khz = policy->cpuinfo.max_freq;
7953
+ cpufreq_cpu_put(policy);
7954
+ }
67607955 put_cpu();
67617956 #endif
67627957 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
67637958 CPUFREQ_TRANSITION_NOTIFIER);
67647959 }
6765
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
67667960
67677961 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
67687962 kvmclock_cpu_online, kvmclock_cpu_down_prep);
....@@ -6781,7 +7975,7 @@
67817975 int user_mode = 3;
67827976
67837977 if (__this_cpu_read(current_vcpu))
6784
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
7978
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
67857979
67867980 return user_mode != 0;
67877981 }
....@@ -6796,10 +7990,20 @@
67967990 return ip;
67977991 }
67987992
7993
+static void kvm_handle_intel_pt_intr(void)
7994
+{
7995
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
7996
+
7997
+ kvm_make_request(KVM_REQ_PMI, vcpu);
7998
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
7999
+ (unsigned long *)&vcpu->arch.pmu.global_status);
8000
+}
8001
+
67998002 static struct perf_guest_info_callbacks kvm_guest_cbs = {
68008003 .is_in_guest = kvm_is_in_guest,
68018004 .is_user_mode = kvm_is_user_mode,
68028005 .get_guest_ip = kvm_get_guest_ip,
8006
+ .handle_intel_pt_intr = NULL,
68038007 };
68048008
68058009 #ifdef CONFIG_X86_64
....@@ -6821,6 +8025,18 @@
68218025 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
68228026
68238027 /*
8028
+ * Indirection to move queue_work() out of the tk_core.seq write held
8029
+ * region to prevent possible deadlocks against time accessors which
8030
+ * are invoked with work related locks held.
8031
+ */
8032
+static void pvclock_irq_work_fn(struct irq_work *w)
8033
+{
8034
+ queue_work(system_long_wq, &pvclock_gtod_work);
8035
+}
8036
+
8037
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8038
+
8039
+/*
68248040 * Notification about pvclock gtod data update.
68258041 */
68268042 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
....@@ -6831,13 +8047,14 @@
68318047
68328048 update_pvclock_gtod(tk);
68338049
6834
- /* disable master clock if host does not trust, or does not
6835
- * use, TSC based clocksource.
8050
+ /*
8051
+ * Disable master clock if host does not trust, or does not use,
8052
+ * TSC based clocksource. Delegate queue_work() to irq_work as
8053
+ * this is invoked with tk_core.seq write held.
68368054 */
68378055 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
68388056 atomic_read(&kvm_guest_has_master_clock) != 0)
6839
- queue_work(system_long_wq, &pvclock_gtod_work);
6840
-
8057
+ irq_work_queue(&pvclock_irq_work);
68418058 return 0;
68428059 }
68438060
....@@ -6848,50 +8065,87 @@
68488065
68498066 int kvm_arch_init(void *opaque)
68508067 {
8068
+ struct kvm_x86_init_ops *ops = opaque;
68518069 int r;
6852
- struct kvm_x86_ops *ops = opaque;
68538070
6854
- if (kvm_x86_ops) {
8071
+ if (kvm_x86_ops.hardware_enable) {
68558072 printk(KERN_ERR "kvm: already loaded the other module\n");
68568073 r = -EEXIST;
68578074 goto out;
68588075 }
68598076
68608077 if (!ops->cpu_has_kvm_support()) {
6861
- printk(KERN_ERR "kvm: no hardware support\n");
8078
+ pr_err_ratelimited("kvm: no hardware support\n");
68628079 r = -EOPNOTSUPP;
68638080 goto out;
68648081 }
68658082 if (ops->disabled_by_bios()) {
6866
- printk(KERN_ERR "kvm: disabled by bios\n");
8083
+ pr_err_ratelimited("kvm: disabled by bios\n");
68678084 r = -EOPNOTSUPP;
68688085 goto out;
68698086 }
68708087
6871
- r = -ENOMEM;
6872
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
6873
- if (!shared_msrs) {
6874
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
8088
+ /*
8089
+ * KVM explicitly assumes that the guest has an FPU and
8090
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
8091
+ * vCPU's FPU state as a fxregs_state struct.
8092
+ */
8093
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8094
+ printk(KERN_ERR "kvm: inadequate fpu\n");
8095
+ r = -EOPNOTSUPP;
68758096 goto out;
68768097 }
68778098
6878
- r = kvm_mmu_module_init();
8099
+#ifdef CONFIG_PREEMPT_RT
8100
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
8101
+ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
8102
+ r = -EOPNOTSUPP;
8103
+ goto out;
8104
+ }
8105
+#endif
8106
+
8107
+ r = -ENOMEM;
8108
+ x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
8109
+ __alignof__(struct fpu), SLAB_ACCOUNT,
8110
+ NULL);
8111
+ if (!x86_fpu_cache) {
8112
+ printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
8113
+ goto out;
8114
+ }
8115
+
8116
+ x86_emulator_cache = kvm_alloc_emulator_cache();
8117
+ if (!x86_emulator_cache) {
8118
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
8119
+ goto out_free_x86_fpu_cache;
8120
+ }
8121
+
8122
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
8123
+ if (!user_return_msrs) {
8124
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
8125
+ goto out_free_x86_emulator_cache;
8126
+ }
8127
+
8128
+ r = kvm_mmu_vendor_module_init();
68798129 if (r)
68808130 goto out_free_percpu;
6881
-
6882
- kvm_x86_ops = ops;
68838131
68848132 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
68858133 PT_DIRTY_MASK, PT64_NX_MASK, 0,
68868134 PT_PRESENT_MASK, 0, sme_me_mask);
68878135 kvm_timer_init();
68888136
8137
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
8138
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
68898139 perf_register_guest_info_callbacks(&kvm_guest_cbs);
68908140
6891
- if (boot_cpu_has(X86_FEATURE_XSAVE))
8141
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
68928142 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
8143
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
8144
+ }
68938145
68948146 kvm_lapic_init();
8147
+ if (pi_inject_timer == -1)
8148
+ pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
68958149 #ifdef CONFIG_X86_64
68968150 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
68978151
....@@ -6902,7 +8156,11 @@
69028156 return 0;
69038157
69048158 out_free_percpu:
6905
- free_percpu(shared_msrs);
8159
+ free_percpu(user_return_msrs);
8160
+out_free_x86_emulator_cache:
8161
+ kmem_cache_destroy(x86_emulator_cache);
8162
+out_free_x86_fpu_cache:
8163
+ kmem_cache_destroy(x86_fpu_cache);
69068164 out:
69078165 return r;
69088166 }
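The allocation sequence in kvm_arch_init() above follows the usual unwind-on-error shape: each resource gets an out_free_* label that releases everything acquired before it, in reverse order. The same shape reduced to two resources (made-up names):

#include <linux/percpu.h>
#include <linux/slab.h>

static struct kmem_cache *example_cache;
static unsigned long __percpu *example_pcpu;

static int example_init(void)
{
	int r = -ENOMEM;

	example_cache = kmem_cache_create("example", 64, 0, SLAB_ACCOUNT, NULL);
	if (!example_cache)
		goto out;

	example_pcpu = alloc_percpu(unsigned long);
	if (!example_pcpu)
		goto out_free_cache;

	return 0;

out_free_cache:
	kmem_cache_destroy(example_cache);
out:
	return r;
}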
....@@ -6915,6 +8173,7 @@
69158173 #endif
69168174 kvm_lapic_exit();
69178175 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
8176
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
69188177
69198178 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
69208179 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
....@@ -6922,11 +8181,14 @@
69228181 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
69238182 #ifdef CONFIG_X86_64
69248183 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8184
+ irq_work_sync(&pvclock_irq_work);
69258185 cancel_work_sync(&pvclock_gtod_work);
69268186 #endif
6927
- kvm_x86_ops = NULL;
6928
- kvm_mmu_module_exit();
6929
- free_percpu(shared_msrs);
8187
+ kvm_x86_ops.hardware_enable = NULL;
8188
+ kvm_mmu_vendor_module_exit();
8189
+ free_percpu(user_return_msrs);
8190
+ kmem_cache_destroy(x86_emulator_cache);
8191
+ kmem_cache_destroy(x86_fpu_cache);
69308192 }
69318193
69328194 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
....@@ -6990,22 +8252,52 @@
69908252 */
69918253 static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
69928254 {
6993
- struct kvm_lapic_irq lapic_irq;
8255
+ /*
8256
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
8257
+ * common code, e.g. for tracing. Defer initialization to the compiler.
8258
+ */
8259
+ struct kvm_lapic_irq lapic_irq = {
8260
+ .delivery_mode = APIC_DM_REMRD,
8261
+ .dest_mode = APIC_DEST_PHYSICAL,
8262
+ .shorthand = APIC_DEST_NOSHORT,
8263
+ .dest_id = apicid,
8264
+ };
69948265
6995
- lapic_irq.shorthand = 0;
6996
- lapic_irq.dest_mode = 0;
6997
- lapic_irq.level = 0;
6998
- lapic_irq.dest_id = apicid;
6999
- lapic_irq.msi_redir_hint = false;
7000
-
7001
- lapic_irq.delivery_mode = APIC_DM_REMRD;
70028266 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
70038267 }
70048268
7005
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
8269
+bool kvm_apicv_activated(struct kvm *kvm)
70068270 {
7007
- vcpu->arch.apicv_active = false;
7008
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
8271
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8272
+}
8273
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8274
+
8275
+void kvm_apicv_init(struct kvm *kvm, bool enable)
8276
+{
8277
+ if (enable)
8278
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
8279
+ &kvm->arch.apicv_inhibit_reasons);
8280
+ else
8281
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
8282
+ &kvm->arch.apicv_inhibit_reasons);
8283
+}
8284
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
8285
+
8286
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8287
+{
8288
+ struct kvm_vcpu *target = NULL;
8289
+ struct kvm_apic_map *map;
8290
+
8291
+ rcu_read_lock();
8292
+ map = rcu_dereference(kvm->arch.apic_map);
8293
+
8294
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8295
+ target = map->phys_map[dest_id]->vcpu;
8296
+
8297
+ rcu_read_unlock();
8298
+
8299
+ if (target && READ_ONCE(target->ready))
8300
+ kvm_vcpu_yield_to(target);
70098301 }
70108302
70118303 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
....@@ -7016,11 +8308,11 @@
70168308 if (kvm_hv_hypercall_enabled(vcpu->kvm))
70178309 return kvm_hv_hypercall(vcpu);
70188310
7019
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
7020
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
7021
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
7022
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
7023
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
8311
+ nr = kvm_rax_read(vcpu);
8312
+ a0 = kvm_rbx_read(vcpu);
8313
+ a1 = kvm_rcx_read(vcpu);
8314
+ a2 = kvm_rdx_read(vcpu);
8315
+ a3 = kvm_rsi_read(vcpu);
70248316
70258317 trace_kvm_hypercall(nr, a0, a1, a2, a3);
70268318
....@@ -7033,17 +8325,23 @@
70338325 a3 &= 0xFFFFFFFF;
70348326 }
70358327
7036
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
8328
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
70378329 ret = -KVM_EPERM;
70388330 goto out;
70398331 }
8332
+
8333
+ ret = -KVM_ENOSYS;
70408334
70418335 switch (nr) {
70428336 case KVM_HC_VAPIC_POLL_IRQ:
70438337 ret = 0;
70448338 break;
70458339 case KVM_HC_KICK_CPU:
8340
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8341
+ break;
8342
+
70468343 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8344
+ kvm_sched_yield(vcpu->kvm, a1);
70478345 ret = 0;
70488346 break;
70498347 #ifdef CONFIG_X86_64
....@@ -7052,7 +8350,17 @@
70528350 break;
70538351 #endif
70548352 case KVM_HC_SEND_IPI:
8353
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8354
+ break;
8355
+
70558356 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8357
+ break;
8358
+ case KVM_HC_SCHED_YIELD:
8359
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8360
+ break;
8361
+
8362
+ kvm_sched_yield(vcpu->kvm, a0);
8363
+ ret = 0;
70568364 break;
70578365 default:
70588366 ret = -KVM_ENOSYS;
....@@ -7061,7 +8369,7 @@
70618369 out:
70628370 if (!op_64_bit)
70638371 ret = (u32)ret;
7064
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
8372
+ kvm_rax_write(vcpu, ret);
70658373
70668374 ++vcpu->stat.hypercalls;
70678375 return kvm_skip_emulated_instruction(vcpu);
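kvm_emulate_hypercall() above reads the call number from RAX and its arguments from RBX/RCX/RDX/RSI, and hands the result back in RAX. A hedged guest-side sketch of a one-argument hypercall as issued on an Intel CPU (AMD uses vmmcall; real guests go through the kvm_hypercall*() helpers, which pick the right instruction):

static inline long example_kvm_hypercall1(unsigned int nr, unsigned long p1)
{
	long ret;

	/* nr in RAX, first argument in RBX; the host writes the result to RAX. */
	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1)
		     : "memory");
	return ret;
}

/* e.g. example_kvm_hypercall1(KVM_HC_SCHED_YIELD, dest_apicid); */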
....@@ -7074,7 +8382,7 @@
70748382 char instruction[3];
70758383 unsigned long rip = kvm_rip_read(vcpu);
70768384
7077
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
8385
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
70788386
70798387 return emulator_write_emulated(ctxt, rip, instruction, 3,
70808388 &ctxt->exception);
....@@ -7103,7 +8411,7 @@
71038411 {
71048412 int max_irr, tpr;
71058413
7106
- if (!kvm_x86_ops->update_cr8_intercept)
8414
+ if (!kvm_x86_ops.update_cr8_intercept)
71078415 return;
71088416
71098417 if (!lapic_in_kernel(vcpu))
....@@ -7122,24 +8430,32 @@
71228430
71238431 tpr = kvm_lapic_get_cr8(vcpu);
71248432
7125
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
8433
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
71268434 }
71278435
71288436 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
71298437 {
7130
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
7131
- vcpu->arch.exception.error_code = false;
7132
- kvm_x86_ops->queue_exception(vcpu);
8438
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
8439
+ vcpu->arch.exception.has_error_code,
8440
+ vcpu->arch.exception.error_code,
8441
+ vcpu->arch.exception.injected);
8442
+
8443
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
8444
+ vcpu->arch.exception.error_code = false;
8445
+ kvm_x86_ops.queue_exception(vcpu);
71338446 }
71348447
7135
-static int inject_pending_event(struct kvm_vcpu *vcpu)
8448
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
71368449 {
71378450 int r;
8451
+ bool can_inject = true;
71388452
71398453 /* try to reinject previous events if any */
71408454
7141
- if (vcpu->arch.exception.injected)
8455
+ if (vcpu->arch.exception.injected) {
71428456 kvm_inject_exception(vcpu);
8457
+ can_inject = false;
8458
+ }
71438459 /*
71448460 * Do not inject an NMI or interrupt if there is a pending
71458461 * exception. Exceptions and interrupts are recognized at
....@@ -7155,11 +8471,17 @@
71558471 * fully complete the previous instruction.
71568472 */
71578473 else if (!vcpu->arch.exception.pending) {
7158
- if (vcpu->arch.nmi_injected)
7159
- kvm_x86_ops->set_nmi(vcpu);
7160
- else if (vcpu->arch.interrupt.injected)
7161
- kvm_x86_ops->set_irq(vcpu);
8474
+ if (vcpu->arch.nmi_injected) {
8475
+ kvm_x86_ops.set_nmi(vcpu);
8476
+ can_inject = false;
8477
+ } else if (vcpu->arch.interrupt.injected) {
8478
+ kvm_x86_ops.set_irq(vcpu);
8479
+ can_inject = false;
8480
+ }
71628481 }
8482
+
8483
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
8484
+ vcpu->arch.exception.pending);
71638485
71648486 /*
71658487 * Call check_nested_events() even if we reinjected a previous event
....@@ -7167,69 +8489,107 @@
71678489 * from L2 to L1 due to pending L1 events which require exit
71688490 * from L2 to L1.
71698491 */
7170
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7171
- r = kvm_x86_ops->check_nested_events(vcpu);
7172
- if (r != 0)
7173
- return r;
8492
+ if (is_guest_mode(vcpu)) {
8493
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
8494
+ if (r < 0)
8495
+ goto busy;
71748496 }
71758497
71768498 /* try to inject new event if pending */
71778499 if (vcpu->arch.exception.pending) {
7178
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
7179
- vcpu->arch.exception.has_error_code,
7180
- vcpu->arch.exception.error_code);
7181
-
7182
- WARN_ON_ONCE(vcpu->arch.exception.injected);
7183
- vcpu->arch.exception.pending = false;
7184
- vcpu->arch.exception.injected = true;
7185
-
8500
+ /*
8501
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
8502
+ * value pushed on the stack. Trap-like exceptions and all #DBs
8503
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
8504
+ * AMD states that code breakpoint #DBs explicitly clear RF=0).
8505
+ *
8506
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
8507
+ * describe the behavior of General Detect #DBs, which are
8508
+ * fault-like. They do _not_ set RF, a la code breakpoints.
8509
+ */
71868510 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
71878511 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
71888512 X86_EFLAGS_RF);
71898513
7190
- if (vcpu->arch.exception.nr == DB_VECTOR &&
7191
- (vcpu->arch.dr7 & DR7_GD)) {
7192
- vcpu->arch.dr7 &= ~DR7_GD;
7193
- kvm_update_dr7(vcpu);
8514
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
8515
+ kvm_deliver_exception_payload(vcpu);
8516
+ if (vcpu->arch.dr7 & DR7_GD) {
8517
+ vcpu->arch.dr7 &= ~DR7_GD;
8518
+ kvm_update_dr7(vcpu);
8519
+ }
71948520 }
71958521
71968522 kvm_inject_exception(vcpu);
8523
+
8524
+ vcpu->arch.exception.pending = false;
8525
+ vcpu->arch.exception.injected = true;
8526
+
8527
+ can_inject = false;
71978528 }
71988529
7199
- /* Don't consider new event if we re-injected an event */
7200
- if (kvm_event_needs_reinjection(vcpu))
7201
- return 0;
7202
-
7203
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
7204
- kvm_x86_ops->smi_allowed(vcpu)) {
7205
- vcpu->arch.smi_pending = false;
7206
- ++vcpu->arch.smi_count;
7207
- enter_smm(vcpu);
7208
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
7209
- --vcpu->arch.nmi_pending;
7210
- vcpu->arch.nmi_injected = true;
7211
- kvm_x86_ops->set_nmi(vcpu);
7212
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
7213
- /*
7214
- * Because interrupts can be injected asynchronously, we are
7215
- * calling check_nested_events again here to avoid a race condition.
7216
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
7217
- * proposal and current concerns. Perhaps we should be setting
7218
- * KVM_REQ_EVENT only on certain events and not unconditionally?
7219
- */
7220
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7221
- r = kvm_x86_ops->check_nested_events(vcpu);
7222
- if (r != 0)
7223
- return r;
7224
- }
7225
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
7226
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
7227
- false);
7228
- kvm_x86_ops->set_irq(vcpu);
7229
- }
8530
+ /*
8531
+ * Finally, inject interrupt events. If an event cannot be injected
8532
+ * due to architectural conditions (e.g. IF=0) a window-open exit
8533
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
8534
+ * and can architecturally be injected, but we cannot do it right now:
8535
+ * an interrupt could have arrived just now and we have to inject it
8536
+ * as a vmexit, or there could already an event in the queue, which is
8537
+ * indicated by can_inject. In that case we request an immediate exit
8538
+ * in order to make progress and get back here for another iteration.
8539
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8540
+ */
8541
+ if (vcpu->arch.smi_pending) {
8542
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8543
+ if (r < 0)
8544
+ goto busy;
8545
+ if (r) {
8546
+ vcpu->arch.smi_pending = false;
8547
+ ++vcpu->arch.smi_count;
8548
+ enter_smm(vcpu);
8549
+ can_inject = false;
8550
+ } else
8551
+ kvm_x86_ops.enable_smi_window(vcpu);
72308552 }
72318553
7232
- return 0;
8554
+ if (vcpu->arch.nmi_pending) {
8555
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8556
+ if (r < 0)
8557
+ goto busy;
8558
+ if (r) {
8559
+ --vcpu->arch.nmi_pending;
8560
+ vcpu->arch.nmi_injected = true;
8561
+ kvm_x86_ops.set_nmi(vcpu);
8562
+ can_inject = false;
8563
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8564
+ }
8565
+ if (vcpu->arch.nmi_pending)
8566
+ kvm_x86_ops.enable_nmi_window(vcpu);
8567
+ }
8568
+
8569
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
8570
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8571
+ if (r < 0)
8572
+ goto busy;
8573
+ if (r) {
8574
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8575
+ kvm_x86_ops.set_irq(vcpu);
8576
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8577
+ }
8578
+ if (kvm_cpu_has_injectable_intr(vcpu))
8579
+ kvm_x86_ops.enable_irq_window(vcpu);
8580
+ }
8581
+
8582
+ if (is_guest_mode(vcpu) &&
8583
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
8584
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8585
+ *req_immediate_exit = true;
8586
+
8587
+ WARN_ON(vcpu->arch.exception.pending);
8588
+ return;
8589
+
8590
+busy:
8591
+ *req_immediate_exit = true;
8592
+ return;
72338593 }
72348594
72358595 static void process_nmi(struct kvm_vcpu *vcpu)
....@@ -7241,7 +8601,7 @@
72418601 * If an NMI is already in progress, limit further NMIs to just one.
72428602 * Otherwise, allow two (and we'll inject the first one immediately).
72438603 */
7244
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8604
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
72458605 limit = 1;
72468606
72478607 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
....@@ -7331,11 +8691,11 @@
73318691 put_smstate(u32, buf, 0x7f7c, seg.limit);
73328692 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
73338693
7334
- kvm_x86_ops->get_gdt(vcpu, &dt);
8694
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73358695 put_smstate(u32, buf, 0x7f74, dt.address);
73368696 put_smstate(u32, buf, 0x7f70, dt.size);
73378697
7338
- kvm_x86_ops->get_idt(vcpu, &dt);
8698
+ kvm_x86_ops.get_idt(vcpu, &dt);
73398699 put_smstate(u32, buf, 0x7f58, dt.address);
73408700 put_smstate(u32, buf, 0x7f54, dt.size);
73418701
....@@ -7385,7 +8745,7 @@
73858745 put_smstate(u32, buf, 0x7e94, seg.limit);
73868746 put_smstate(u64, buf, 0x7e98, seg.base);
73878747
7388
- kvm_x86_ops->get_idt(vcpu, &dt);
8748
+ kvm_x86_ops.get_idt(vcpu, &dt);
73898749 put_smstate(u32, buf, 0x7e84, dt.size);
73908750 put_smstate(u64, buf, 0x7e88, dt.address);
73918751
....@@ -7395,7 +8755,7 @@
73958755 put_smstate(u32, buf, 0x7e74, seg.limit);
73968756 put_smstate(u64, buf, 0x7e78, seg.base);
73978757
7398
- kvm_x86_ops->get_gdt(vcpu, &dt);
8758
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73998759 put_smstate(u32, buf, 0x7e64, dt.size);
74008760 put_smstate(u64, buf, 0x7e68, dt.address);
74018761
....@@ -7425,28 +8785,28 @@
74258785 * vCPU state (e.g. leave guest mode) after we've saved the state into
74268786 * the SMM state-save area.
74278787 */
7428
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
8788
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
74298789
74308790 vcpu->arch.hflags |= HF_SMM_MASK;
74318791 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
74328792
7433
- if (kvm_x86_ops->get_nmi_mask(vcpu))
8793
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
74348794 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
74358795 else
7436
- kvm_x86_ops->set_nmi_mask(vcpu, true);
8796
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
74378797
74388798 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
74398799 kvm_rip_write(vcpu, 0x8000);
74408800
74418801 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
7442
- kvm_x86_ops->set_cr0(vcpu, cr0);
8802
+ kvm_x86_ops.set_cr0(vcpu, cr0);
74438803 vcpu->arch.cr0 = cr0;
74448804
7445
- kvm_x86_ops->set_cr4(vcpu, 0);
8805
+ kvm_x86_ops.set_cr4(vcpu, 0);
74468806
74478807 /* Undocumented: IDT limit is set to zero on entry to SMM. */
74488808 dt.address = dt.size = 0;
7449
- kvm_x86_ops->set_idt(vcpu, &dt);
8809
+ kvm_x86_ops.set_idt(vcpu, &dt);
74508810
74518811 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
74528812
....@@ -7477,10 +8837,10 @@
74778837
74788838 #ifdef CONFIG_X86_64
74798839 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
7480
- kvm_x86_ops->set_efer(vcpu, 0);
8840
+ kvm_x86_ops.set_efer(vcpu, 0);
74818841 #endif
74828842
7483
- kvm_update_cpuid(vcpu);
8843
+ kvm_update_cpuid_runtime(vcpu);
74848844 kvm_mmu_reset_context(vcpu);
74858845 }
74868846
....@@ -7490,10 +8850,82 @@
74908850 kvm_make_request(KVM_REQ_EVENT, vcpu);
74918851 }
74928852
8853
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8854
+ unsigned long *vcpu_bitmap)
8855
+{
8856
+ cpumask_var_t cpus;
8857
+
8858
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8859
+
8860
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8861
+ NULL, vcpu_bitmap, cpus);
8862
+
8863
+ free_cpumask_var(cpus);
8864
+}
8865
+
74938866 void kvm_make_scan_ioapic_request(struct kvm *kvm)
74948867 {
74958868 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
74968869 }
8870
+
8871
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8872
+{
8873
+ if (!lapic_in_kernel(vcpu))
8874
+ return;
8875
+
8876
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8877
+ kvm_apic_update_apicv(vcpu);
8878
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8879
+}
8880
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8881
+
8882
+/*
8883
+ * NOTE: Do not hold any lock prior to calling this.
8884
+ *
8885
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8886
+ * locked, because it calls __x86_set_memory_region() which does
8887
+ * synchronize_srcu(&kvm->srcu).
8888
+ */
8889
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8890
+{
8891
+ struct kvm_vcpu *except;
8892
+ unsigned long old, new, expected;
8893
+
8894
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8895
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8896
+ return;
8897
+
8898
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8899
+ do {
8900
+ expected = new = old;
8901
+ if (activate)
8902
+ __clear_bit(bit, &new);
8903
+ else
8904
+ __set_bit(bit, &new);
8905
+ if (new == old)
8906
+ break;
8907
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8908
+ } while (old != expected);
8909
+
8910
+ if (!!old == !!new)
8911
+ return;
8912
+
8913
+ trace_kvm_apicv_update_request(activate, bit);
8914
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8915
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8916
+
8917
+ /*
8918
+ * Sending request to update APICV for all other vcpus,
8919
+ * while update the calling vcpu immediately instead of
8920
+ * waiting for another #VMEXIT to handle the request.
8921
+ */
8922
+ except = kvm_get_running_vcpu();
8923
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8924
+ except);
8925
+ if (except)
8926
+ kvm_vcpu_update_apicv(except);
8927
+}
8928
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
74978929
74988930 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
74998931 {
....@@ -7506,7 +8938,7 @@
75068938 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
75078939 else {
75088940 if (vcpu->arch.apicv_active)
7509
- kvm_x86_ops->sync_pir_to_irr(vcpu);
8941
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
75108942 if (ioapic_in_kernel(vcpu->kvm))
75118943 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
75128944 }
....@@ -7526,7 +8958,7 @@
75268958
75278959 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
75288960 vcpu_to_synic(vcpu)->vec_bitmap, 256);
7529
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
8961
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
75308962 }
75318963
75328964 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
....@@ -7543,28 +8975,22 @@
75438975 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
75448976 }
75458977
8978
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
8979
+{
8980
+ if (kvm_x86_ops.guest_memory_reclaimed)
8981
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
8982
+}
8983
+
75468984 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
75478985 {
7548
- struct page *page = NULL;
7549
-
75508986 if (!lapic_in_kernel(vcpu))
75518987 return;
75528988
7553
- if (!kvm_x86_ops->set_apic_access_page_addr)
8989
+ if (!kvm_x86_ops.set_apic_access_page_addr)
75548990 return;
75558991
7556
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
7557
- if (is_error_page(page))
7558
- return;
7559
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
7560
-
7561
- /*
7562
- * Do not pin apic access page in memory, the MMU notifier
7563
- * will call us again if it is migrated or swapped out.
7564
- */
7565
- put_page(page);
8992
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
75668993 }
7567
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
75688994
75698995 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
75708996 {
....@@ -7583,12 +9009,17 @@
75839009 bool req_int_win =
75849010 dm_request_for_irq_injection(vcpu) &&
75859011 kvm_cpu_accept_dm_intr(vcpu);
9012
+ fastpath_t exit_fastpath;
75869013
75879014 bool req_immediate_exit = false;
75889015
75899016 if (kvm_request_pending(vcpu)) {
7590
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
7591
- kvm_x86_ops->get_vmcs12_pages(vcpu);
9017
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9018
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
9019
+ r = 0;
9020
+ goto out;
9021
+ }
9022
+ }
75929023 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
75939024 kvm_mmu_unload(vcpu);
75949025 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
....@@ -7604,10 +9035,19 @@
76049035 }
76059036 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
76069037 kvm_mmu_sync_roots(vcpu);
7607
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
7608
- kvm_mmu_load_cr3(vcpu);
7609
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
7610
- kvm_vcpu_flush_tlb(vcpu, true);
9038
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
9039
+ kvm_mmu_load_pgd(vcpu);
9040
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
9041
+ kvm_vcpu_flush_tlb_all(vcpu);
9042
+
9043
+ /* Flushing all ASIDs flushes the current ASID... */
9044
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9045
+ }
9046
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
9047
+ kvm_vcpu_flush_tlb_current(vcpu);
9048
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
9049
+ kvm_vcpu_flush_tlb_guest(vcpu);
9050
+
76119051 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
76129052 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
76139053 r = 0;
....@@ -7678,6 +9118,12 @@
76789118 */
76799119 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
76809120 kvm_hv_process_stimers(vcpu);
9121
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
9122
+ kvm_vcpu_update_apicv(vcpu);
9123
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
9124
+ kvm_check_async_pf_completion(vcpu);
9125
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
9126
+ kvm_x86_ops.msr_filter_changed(vcpu);
76819127 }
76829128
76839129 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
....@@ -7688,32 +9134,9 @@
76889134 goto out;
76899135 }
76909136
7691
- if (inject_pending_event(vcpu) != 0)
7692
- req_immediate_exit = true;
7693
- else {
7694
- /* Enable SMI/NMI/IRQ window open exits if needed.
7695
- *
7696
- * SMIs have three cases:
7697
- * 1) They can be nested, and then there is nothing to
7698
- * do here because RSM will cause a vmexit anyway.
7699
- * 2) There is an ISA-specific reason why SMI cannot be
7700
- * injected, and the moment when this changes can be
7701
- * intercepted.
7702
- * 3) Or the SMI can be pending because
7703
- * inject_pending_event has completed the injection
7704
- * of an IRQ or NMI from the previous vmexit, and
7705
- * then we request an immediate exit to inject the
7706
- * SMI.
7707
- */
7708
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
7709
- if (!kvm_x86_ops->enable_smi_window(vcpu))
7710
- req_immediate_exit = true;
7711
- if (vcpu->arch.nmi_pending)
7712
- kvm_x86_ops->enable_nmi_window(vcpu);
7713
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
7714
- kvm_x86_ops->enable_irq_window(vcpu);
7715
- WARN_ON(vcpu->arch.exception.pending);
7716
- }
9137
+ inject_pending_event(vcpu, &req_immediate_exit);
9138
+ if (req_int_win)
9139
+ kvm_x86_ops.enable_irq_window(vcpu);
77179140
77189141 if (kvm_lapic_enabled(vcpu)) {
77199142 update_cr8_intercept(vcpu);
....@@ -7728,7 +9151,7 @@
77289151
77299152 preempt_disable();
77309153
7731
- kvm_x86_ops->prepare_guest_switch(vcpu);
9154
+ kvm_x86_ops.prepare_guest_switch(vcpu);
77329155
77339156 /*
77349157 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
....@@ -7744,7 +9167,7 @@
77449167 * 1) We should set ->mode before checking ->requests. Please see
77459168 * the comment in kvm_vcpu_exiting_guest_mode().
77469169 *
7747
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
9170
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
77489171 * pairs with the memory barrier implicit in pi_test_and_set_on
77499172 * (see vmx_deliver_posted_interrupt).
77509173 *
....@@ -7759,10 +9182,9 @@
77599182 * notified with kvm_vcpu_kick.
77609183 */
77619184 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
7762
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9185
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
77639186
7764
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
7765
- || need_resched() || signal_pending(current)) {
9187
+ if (kvm_vcpu_exit_request(vcpu)) {
77669188 vcpu->mode = OUTSIDE_GUEST_MODE;
77679189 smp_wmb();
77689190 local_irq_enable();
....@@ -7774,13 +9196,14 @@
77749196
77759197 if (req_immediate_exit) {
77769198 kvm_make_request(KVM_REQ_EVENT, vcpu);
7777
- kvm_x86_ops->request_immediate_exit(vcpu);
9199
+ kvm_x86_ops.request_immediate_exit(vcpu);
77789200 }
77799201
7780
- trace_kvm_entry(vcpu->vcpu_id);
7781
- if (lapic_timer_advance_ns)
7782
- wait_lapic_expire(vcpu);
7783
- guest_enter_irqoff();
9202
+ trace_kvm_entry(vcpu);
9203
+
9204
+ fpregs_assert_state_consistent();
9205
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9206
+ switch_fpu_return();
77849207
77859208 if (unlikely(vcpu->arch.switch_db_regs)) {
77869209 set_debugreg(0, 7);
....@@ -7794,7 +9217,7 @@
77949217 set_debugreg(0, 7);
77959218 }
77969219
7797
- kvm_x86_ops->run(vcpu);
9220
+ exit_fastpath = kvm_x86_ops.run(vcpu);
77989221
77999222 /*
78009223 * Do this here before restoring debug registers on the host. And
....@@ -7804,9 +9227,8 @@
78049227 */
78059228 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
78069229 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
7807
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
9230
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
78089231 kvm_update_dr0123(vcpu);
7809
- kvm_update_dr6(vcpu);
78109232 kvm_update_dr7(vcpu);
78119233 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
78129234 }
....@@ -7821,18 +9243,43 @@
78219243 if (hw_breakpoint_active())
78229244 hw_breakpoint_restore();
78239245
9246
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
78249247 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
78259248
78269249 vcpu->mode = OUTSIDE_GUEST_MODE;
78279250 smp_wmb();
78289251
9252
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
9253
+
9254
+ /*
9255
+ * Consume any pending interrupts, including the possible source of
9256
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
9257
+ * An instruction is required after local_irq_enable() to fully unblock
9258
+ * interrupts on processors that implement an interrupt shadow; the
9259
+ * stat.exits increment will do nicely.
9260
+ */
78299261 kvm_before_interrupt(vcpu);
7830
- kvm_x86_ops->handle_external_intr(vcpu);
9262
+ local_irq_enable();
9263
+ ++vcpu->stat.exits;
9264
+ local_irq_disable();
78319265 kvm_after_interrupt(vcpu);
78329266
7833
- ++vcpu->stat.exits;
9267
+ /*
9268
+ * Wait until after servicing IRQs to account guest time so that any
9269
+ * ticks that occurred while running the guest are properly accounted
9270
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
9271
+ * of accounting via context tracking, but the loss of accuracy is
9272
+ * acceptable for all known use cases.
9273
+ */
9274
+ vtime_account_guest_exit();
78349275
7835
- guest_exit_irqoff();
9276
+ if (lapic_in_kernel(vcpu)) {
9277
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9278
+ if (delta != S64_MIN) {
9279
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9280
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9281
+ }
9282
+ }
78369283
78379284 local_irq_enable();
78389285 preempt_enable();
....@@ -7853,12 +9300,13 @@
78539300 if (vcpu->arch.apic_attention)
78549301 kvm_lapic_sync_from_vapic(vcpu);
78559302
7856
- vcpu->arch.gpa_available = false;
7857
- r = kvm_x86_ops->handle_exit(vcpu);
9303
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
78589304 return r;
78599305
78609306 cancel_injection:
7861
- kvm_x86_ops->cancel_injection(vcpu);
9307
+ if (req_immediate_exit)
9308
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
9309
+ kvm_x86_ops.cancel_injection(vcpu);
78629310 if (unlikely(vcpu->arch.apic_attention))
78639311 kvm_lapic_sync_from_vapic(vcpu);
78649312 out:
....@@ -7868,13 +9316,13 @@
78689316 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
78699317 {
78709318 if (!kvm_arch_vcpu_runnable(vcpu) &&
7871
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
9319
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
78729320 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
78739321 kvm_vcpu_block(vcpu);
78749322 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
78759323
7876
- if (kvm_x86_ops->post_block)
7877
- kvm_x86_ops->post_block(vcpu);
9324
+ if (kvm_x86_ops.post_block)
9325
+ kvm_x86_ops.post_block(vcpu);
78789326
78799327 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
78809328 return 1;
....@@ -7886,6 +9334,7 @@
78869334 vcpu->arch.pv.pv_unhalted = false;
78879335 vcpu->arch.mp_state =
78889336 KVM_MP_STATE_RUNNABLE;
9337
+ fallthrough;
78899338 case KVM_MP_STATE_RUNNABLE:
78909339 vcpu->arch.apf.halted = false;
78919340 break;
....@@ -7893,15 +9342,14 @@
78939342 break;
78949343 default:
78959344 return -EINTR;
7896
- break;
78979345 }
78989346 return 1;
78999347 }
79009348
79019349 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
79029350 {
7903
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7904
- kvm_x86_ops->check_nested_events(vcpu);
9351
+ if (is_guest_mode(vcpu))
9352
+ kvm_x86_ops.nested_ops->check_events(vcpu);
79059353
79069354 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
79079355 !vcpu->arch.apf.halted);
....@@ -7937,17 +9385,11 @@
79379385 break;
79389386 }
79399387
7940
- kvm_check_async_pf_completion(vcpu);
7941
-
7942
- if (signal_pending(current)) {
7943
- r = -EINTR;
7944
- vcpu->run->exit_reason = KVM_EXIT_INTR;
7945
- ++vcpu->stat.signal_exits;
7946
- break;
7947
- }
7948
- if (need_resched()) {
9388
+ if (__xfer_to_guest_mode_work_pending()) {
79499389 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
7950
- cond_resched();
9390
+ r = xfer_to_guest_mode_handle_work(vcpu);
9391
+ if (r)
9392
+ return r;
79519393 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
79529394 }
79539395 }
....@@ -7960,12 +9402,11 @@
79609402 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
79619403 {
79629404 int r;
9405
+
79639406 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
79649407 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
79659408 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
7966
- if (r != EMULATE_DONE)
7967
- return 0;
7968
- return 1;
9409
+ return r;
79699410 }
79709411
79719412 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
....@@ -8038,31 +9479,55 @@
80389479 return 0;
80399480 }
80409481
9482
+static void kvm_save_current_fpu(struct fpu *fpu)
9483
+{
9484
+ /*
9485
+ * If the target FPU state is not resident in the CPU registers, just
9486
+ * memcpy() from current, else save CPU state directly to the target.
9487
+ */
9488
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9489
+ memcpy(&fpu->state, &current->thread.fpu.state,
9490
+ fpu_kernel_xstate_size);
9491
+ else
9492
+ copy_fpregs_to_fpstate(fpu);
9493
+}
9494
+
80419495 /* Swap (qemu) user FPU context for the guest FPU context. */
80429496 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
80439497 {
8044
- preempt_disable();
8045
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
8046
- /* PKRU is separately restored in kvm_x86_ops->run. */
8047
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
9498
+ fpregs_lock();
9499
+
9500
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
9501
+
9502
+ /* PKRU is separately restored in kvm_x86_ops.run. */
9503
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
80489504 ~XFEATURE_MASK_PKRU);
8049
- preempt_enable();
9505
+
9506
+ fpregs_mark_activate();
9507
+ fpregs_unlock();
9508
+
80509509 trace_kvm_fpu(1);
80519510 }
80529511
80539512 /* When vcpu_run ends, restore user space FPU context. */
80549513 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
80559514 {
8056
- preempt_disable();
8057
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
8058
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
8059
- preempt_enable();
9515
+ fpregs_lock();
9516
+
9517
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
9518
+
9519
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9520
+
9521
+ fpregs_mark_activate();
9522
+ fpregs_unlock();
9523
+
80609524 ++vcpu->stat.fpu_reload;
80619525 trace_kvm_fpu(0);
80629526 }
80639527
8064
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
9528
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
80659529 {
9530
+ struct kvm_run *kvm_run = vcpu->run;
80669531 int r;
80679532
80689533 vcpu_load(vcpu);
....@@ -8080,18 +9545,18 @@
80809545 r = -EAGAIN;
80819546 if (signal_pending(current)) {
80829547 r = -EINTR;
8083
- vcpu->run->exit_reason = KVM_EXIT_INTR;
9548
+ kvm_run->exit_reason = KVM_EXIT_INTR;
80849549 ++vcpu->stat.signal_exits;
80859550 }
80869551 goto out;
80879552 }
80889553
8089
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9554
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
80909555 r = -EINVAL;
80919556 goto out;
80929557 }
80939558
8094
- if (vcpu->run->kvm_dirty_regs) {
9559
+ if (kvm_run->kvm_dirty_regs) {
80959560 r = sync_regs(vcpu);
80969561 if (r != 0)
80979562 goto out;
....@@ -8121,7 +9586,7 @@
81219586
81229587 out:
81239588 kvm_put_guest_fpu(vcpu);
8124
- if (vcpu->run->kvm_valid_regs)
9589
+ if (kvm_run->kvm_valid_regs)
81259590 store_regs(vcpu);
81269591 post_kvm_run_save(vcpu);
81279592 kvm_sigset_deactivate(vcpu);
....@@ -8140,26 +9605,26 @@
81409605 * that usually, but some bad designed PV devices (vmware
81419606 * backdoor interface) need this to work
81429607 */
8143
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
9608
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
81449609 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81459610 }
8146
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
8147
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
8148
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
8149
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
8150
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
8151
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
8152
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
8153
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
9611
+ regs->rax = kvm_rax_read(vcpu);
9612
+ regs->rbx = kvm_rbx_read(vcpu);
9613
+ regs->rcx = kvm_rcx_read(vcpu);
9614
+ regs->rdx = kvm_rdx_read(vcpu);
9615
+ regs->rsi = kvm_rsi_read(vcpu);
9616
+ regs->rdi = kvm_rdi_read(vcpu);
9617
+ regs->rsp = kvm_rsp_read(vcpu);
9618
+ regs->rbp = kvm_rbp_read(vcpu);
81549619 #ifdef CONFIG_X86_64
8155
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
8156
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
8157
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
8158
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
8159
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
8160
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
8161
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
8162
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
9620
+ regs->r8 = kvm_r8_read(vcpu);
9621
+ regs->r9 = kvm_r9_read(vcpu);
9622
+ regs->r10 = kvm_r10_read(vcpu);
9623
+ regs->r11 = kvm_r11_read(vcpu);
9624
+ regs->r12 = kvm_r12_read(vcpu);
9625
+ regs->r13 = kvm_r13_read(vcpu);
9626
+ regs->r14 = kvm_r14_read(vcpu);
9627
+ regs->r15 = kvm_r15_read(vcpu);
81639628 #endif
81649629
81659630 regs->rip = kvm_rip_read(vcpu);
....@@ -8179,23 +9644,23 @@
81799644 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
81809645 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81819646
8182
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
8183
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
8184
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
8185
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
8186
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
8187
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
8188
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
8189
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
9647
+ kvm_rax_write(vcpu, regs->rax);
9648
+ kvm_rbx_write(vcpu, regs->rbx);
9649
+ kvm_rcx_write(vcpu, regs->rcx);
9650
+ kvm_rdx_write(vcpu, regs->rdx);
9651
+ kvm_rsi_write(vcpu, regs->rsi);
9652
+ kvm_rdi_write(vcpu, regs->rdi);
9653
+ kvm_rsp_write(vcpu, regs->rsp);
9654
+ kvm_rbp_write(vcpu, regs->rbp);
81909655 #ifdef CONFIG_X86_64
8191
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
8192
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
8193
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
8194
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
8195
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
8196
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
8197
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
8198
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
9656
+ kvm_r8_write(vcpu, regs->r8);
9657
+ kvm_r9_write(vcpu, regs->r9);
9658
+ kvm_r10_write(vcpu, regs->r10);
9659
+ kvm_r11_write(vcpu, regs->r11);
9660
+ kvm_r12_write(vcpu, regs->r12);
9661
+ kvm_r13_write(vcpu, regs->r13);
9662
+ kvm_r14_write(vcpu, regs->r14);
9663
+ kvm_r15_write(vcpu, regs->r15);
81999664 #endif
82009665
82019666 kvm_rip_write(vcpu, regs->rip);
....@@ -8238,10 +9703,10 @@
82389703 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
82399704 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
82409705
8241
- kvm_x86_ops->get_idt(vcpu, &dt);
9706
+ kvm_x86_ops.get_idt(vcpu, &dt);
82429707 sregs->idt.limit = dt.size;
82439708 sregs->idt.base = dt.address;
8244
- kvm_x86_ops->get_gdt(vcpu, &dt);
9709
+ kvm_x86_ops.get_gdt(vcpu, &dt);
82459710 sregs->gdt.limit = dt.size;
82469711 sregs->gdt.base = dt.address;
82479712
....@@ -8253,7 +9718,7 @@
82539718 sregs->efer = vcpu->arch.efer;
82549719 sregs->apic_base = kvm_get_apic_base(vcpu);
82559720
8256
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
9721
+ memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
82579722
82589723 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
82599724 set_bit(vcpu->arch.interrupt.nr,
....@@ -8300,8 +9765,12 @@
83009765 mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
83019766 goto out;
83029767
8303
- /* INITs are latched while in SMM */
8304
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
9768
+ /*
9769
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9770
+ * INIT state; latched init should be reported using
9771
+ * KVM_SET_VCPU_EVENTS, so reject it here.
9772
+ */
9773
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
83059774 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
83069775 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
83079776 goto out;
....@@ -8322,21 +9791,23 @@
83229791 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
83239792 int reason, bool has_error_code, u32 error_code)
83249793 {
8325
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
9794
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
83269795 int ret;
83279796
83289797 init_emulate_ctxt(vcpu);
83299798
83309799 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
83319800 has_error_code, error_code);
8332
-
8333
- if (ret)
8334
- return EMULATE_FAIL;
9801
+ if (ret) {
9802
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9803
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9804
+ vcpu->run->internal.ndata = 0;
9805
+ return 0;
9806
+ }
83359807
83369808 kvm_rip_write(vcpu, ctxt->eip);
83379809 kvm_set_rflags(vcpu, ctxt->eflags);
8338
- kvm_make_request(KVM_REQ_EVENT, vcpu);
8339
- return EMULATE_DONE;
9810
+ return 1;
83409811 }
83419812 EXPORT_SYMBOL_GPL(kvm_task_switch);
83429813
....@@ -8350,6 +9821,8 @@
83509821 */
83519822 if (!(sregs->cr4 & X86_CR4_PAE)
83529823 || !(sregs->efer & EFER_LMA))
9824
+ return -EINVAL;
9825
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
83539826 return -EINVAL;
83549827 } else {
83559828 /*
....@@ -8382,31 +9855,31 @@
83829855
83839856 dt.size = sregs->idt.limit;
83849857 dt.address = sregs->idt.base;
8385
- kvm_x86_ops->set_idt(vcpu, &dt);
9858
+ kvm_x86_ops.set_idt(vcpu, &dt);
83869859 dt.size = sregs->gdt.limit;
83879860 dt.address = sregs->gdt.base;
8388
- kvm_x86_ops->set_gdt(vcpu, &dt);
9861
+ kvm_x86_ops.set_gdt(vcpu, &dt);
83899862
83909863 vcpu->arch.cr2 = sregs->cr2;
83919864 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
83929865 vcpu->arch.cr3 = sregs->cr3;
8393
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
9866
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
83949867
83959868 kvm_set_cr8(vcpu, sregs->cr8);
83969869
83979870 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
8398
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
9871
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
83999872
84009873 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
8401
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
9874
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
84029875 vcpu->arch.cr0 = sregs->cr0;
84039876
84049877 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
84059878 cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
84069879 (X86_CR4_OSXSAVE | X86_CR4_PKE));
8407
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
9880
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
84089881 if (cpuid_update_needed)
8409
- kvm_update_cpuid(vcpu);
9882
+ kvm_update_cpuid_runtime(vcpu);
84109883
84119884 idx = srcu_read_lock(&vcpu->kvm->srcu);
84129885 if (is_pae_paging(vcpu)) {
....@@ -8510,7 +9983,7 @@
85109983 */
85119984 kvm_set_rflags(vcpu, rflags);
85129985
8513
- kvm_x86_ops->update_bp_intercept(vcpu);
9986
+ kvm_x86_ops.update_exception_bitmap(vcpu);
85149987
85159988 r = 0;
85169989
....@@ -8549,7 +10022,7 @@
854910022
855010023 vcpu_load(vcpu);
855110024
8552
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10025
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
855310026 memcpy(fpu->fpr, fxsave->st_space, 128);
855410027 fpu->fcw = fxsave->cwd;
855510028 fpu->fsw = fxsave->swd;
....@@ -8557,7 +10030,7 @@
855710030 fpu->last_opcode = fxsave->fop;
855810031 fpu->last_ip = fxsave->rip;
855910032 fpu->last_dp = fxsave->rdp;
8560
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
10033
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
856110034
856210035 vcpu_put(vcpu);
856310036 return 0;
....@@ -8569,7 +10042,7 @@
856910042
857010043 vcpu_load(vcpu);
857110044
8572
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10045
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
857310046
857410047 memcpy(fxsave->st_space, fpu->fpr, 128);
857510048 fxsave->cwd = fpu->fcw;
....@@ -8578,7 +10051,7 @@
857810051 fxsave->fop = fpu->last_opcode;
857910052 fxsave->rip = fpu->last_ip;
858010053 fxsave->rdp = fpu->last_dp;
8581
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
10054
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
858210055
858310056 vcpu_put(vcpu);
858410057 return 0;
....@@ -8625,9 +10098,9 @@
862510098
862610099 static void fx_init(struct kvm_vcpu *vcpu)
862710100 {
8628
- fpstate_init(&vcpu->arch.guest_fpu.state);
10101
+ fpstate_init(&vcpu->arch.guest_fpu->state);
862910102 if (boot_cpu_has(X86_FEATURE_XSAVES))
8630
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
10103
+ vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
863110104 host_xcr0 | XSTATE_COMPACTION_ENABLED;
863210105
863310106 /*
....@@ -8638,48 +10111,122 @@
863810111 vcpu->arch.cr0 |= X86_CR0_ET;
863910112 }
864010113
8641
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
10114
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
864210115 {
8643
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
8644
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
8645
-
8646
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
8647
-
8648
- kvmclock_reset(vcpu);
8649
-
8650
- kvm_x86_ops->vcpu_free(vcpu);
8651
- free_cpumask_var(wbinvd_dirty_mask);
8652
-}
8653
-
8654
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
8655
- unsigned int id)
8656
-{
8657
- struct kvm_vcpu *vcpu;
8658
-
865910116 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
8660
- printk_once(KERN_WARNING
8661
- "kvm: SMP vm created on host with unstable TSC; "
8662
- "guest TSC will not be reliable\n");
10117
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
10118
+ "guest TSC will not be reliable\n");
866310119
8664
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
8665
-
8666
- return vcpu;
10120
+ return 0;
866710121 }
866810122
8669
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
10123
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
867010124 {
10125
+ struct page *page;
10126
+ int r;
10127
+
10128
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
10129
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10130
+ else
10131
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
10132
+
10133
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
10134
+
10135
+ r = kvm_mmu_create(vcpu);
10136
+ if (r < 0)
10137
+ return r;
10138
+
10139
+ if (irqchip_in_kernel(vcpu->kvm)) {
10140
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
10141
+ if (r < 0)
10142
+ goto fail_mmu_destroy;
10143
+ if (kvm_apicv_activated(vcpu->kvm))
10144
+ vcpu->arch.apicv_active = true;
10145
+ } else
10146
+ static_key_slow_inc(&kvm_no_apic_vcpu);
10147
+
10148
+ r = -ENOMEM;
10149
+
10150
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
10151
+ if (!page)
10152
+ goto fail_free_lapic;
10153
+ vcpu->arch.pio_data = page_address(page);
10154
+
10155
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
10156
+ GFP_KERNEL_ACCOUNT);
10157
+ if (!vcpu->arch.mce_banks)
10158
+ goto fail_free_pio_data;
10159
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
10160
+
10161
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
10162
+ GFP_KERNEL_ACCOUNT))
10163
+ goto fail_free_mce_banks;
10164
+
10165
+ if (!alloc_emulate_ctxt(vcpu))
10166
+ goto free_wbinvd_dirty_mask;
10167
+
10168
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
10169
+ GFP_KERNEL_ACCOUNT);
10170
+ if (!vcpu->arch.user_fpu) {
10171
+ pr_err("kvm: failed to allocate userspace's fpu\n");
10172
+ goto free_emulate_ctxt;
10173
+ }
10174
+
10175
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
10176
+ GFP_KERNEL_ACCOUNT);
10177
+ if (!vcpu->arch.guest_fpu) {
10178
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
10179
+ goto free_user_fpu;
10180
+ }
10181
+ fx_init(vcpu);
10182
+
10183
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
10184
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
10185
+
10186
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
10187
+
10188
+ kvm_async_pf_hash_reset(vcpu);
10189
+ kvm_pmu_init(vcpu);
10190
+
10191
+ vcpu->arch.pending_external_vector = -1;
10192
+ vcpu->arch.preempted_in_kernel = false;
10193
+
10194
+ kvm_hv_vcpu_init(vcpu);
10195
+
10196
+ r = kvm_x86_ops.vcpu_create(vcpu);
10197
+ if (r)
10198
+ goto free_guest_fpu;
10199
+
867110200 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
10201
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
867210202 kvm_vcpu_mtrr_init(vcpu);
867310203 vcpu_load(vcpu);
867410204 kvm_vcpu_reset(vcpu, false);
8675
- kvm_mmu_setup(vcpu);
10205
+ kvm_init_mmu(vcpu, false);
867610206 vcpu_put(vcpu);
867710207 return 0;
10208
+
10209
+free_guest_fpu:
10210
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10211
+free_user_fpu:
10212
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10213
+free_emulate_ctxt:
10214
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10215
+free_wbinvd_dirty_mask:
10216
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10217
+fail_free_mce_banks:
10218
+ kfree(vcpu->arch.mce_banks);
10219
+fail_free_pio_data:
10220
+ free_page((unsigned long)vcpu->arch.pio_data);
10221
+fail_free_lapic:
10222
+ kvm_free_lapic(vcpu);
10223
+fail_mmu_destroy:
10224
+ kvm_mmu_destroy(vcpu);
10225
+ return r;
867810226 }
867910227
868010228 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
868110229 {
8682
- struct msr_data msr;
868310230 struct kvm *kvm = vcpu->kvm;
868410231
868510232 kvm_hv_vcpu_postcreate(vcpu);
....@@ -8687,23 +10234,46 @@
868710234 if (mutex_lock_killable(&vcpu->mutex))
868810235 return;
868910236 vcpu_load(vcpu);
8690
- msr.data = 0x0;
8691
- msr.index = MSR_IA32_TSC;
8692
- msr.host_initiated = true;
8693
- kvm_write_tsc(vcpu, &msr);
10237
+ kvm_synchronize_tsc(vcpu, 0);
869410238 vcpu_put(vcpu);
10239
+
10240
+ /* poll control enabled by default */
10241
+ vcpu->arch.msr_kvm_poll_control = 1;
10242
+
869510243 mutex_unlock(&vcpu->mutex);
869610244
8697
- if (!kvmclock_periodic_sync)
8698
- return;
8699
-
8700
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
8701
- KVMCLOCK_SYNC_PERIOD);
10245
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
10246
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
10247
+ KVMCLOCK_SYNC_PERIOD);
870210248 }
870310249
870410250 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
870510251 {
8706
- kvm_arch_vcpu_free(vcpu);
10252
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
10253
+ int idx;
10254
+
10255
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
10256
+
10257
+ kvmclock_reset(vcpu);
10258
+
10259
+ kvm_x86_ops.vcpu_free(vcpu);
10260
+
10261
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10262
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10263
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10264
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10265
+
10266
+ kvm_hv_vcpu_uninit(vcpu);
10267
+ kvm_pmu_destroy(vcpu);
10268
+ kfree(vcpu->arch.mce_banks);
10269
+ kvm_free_lapic(vcpu);
10270
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
10271
+ kvm_mmu_destroy(vcpu);
10272
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
10273
+ free_page((unsigned long)vcpu->arch.pio_data);
10274
+ kvfree(vcpu->arch.cpuid_entries);
10275
+ if (!lapic_in_kernel(vcpu))
10276
+ static_key_slow_dec(&kvm_no_apic_vcpu);
870710277 }
870810278
870910279 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
....@@ -8719,19 +10289,18 @@
871910289 vcpu->arch.nmi_injected = false;
872010290 kvm_clear_interrupt_queue(vcpu);
872110291 kvm_clear_exception_queue(vcpu);
8722
- vcpu->arch.exception.pending = false;
872310292
872410293 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
872510294 kvm_update_dr0123(vcpu);
872610295 vcpu->arch.dr6 = DR6_INIT;
8727
- kvm_update_dr6(vcpu);
872810296 vcpu->arch.dr7 = DR7_FIXED_1;
872910297 kvm_update_dr7(vcpu);
873010298
873110299 vcpu->arch.cr2 = 0;
873210300
873310301 kvm_make_request(KVM_REQ_EVENT, vcpu);
8734
- vcpu->arch.apf.msr_val = 0;
10302
+ vcpu->arch.apf.msr_en_val = 0;
10303
+ vcpu->arch.apf.msr_int_val = 0;
873510304 vcpu->arch.st.msr_val = 0;
873610305
873710306 kvmclock_reset(vcpu);
....@@ -8749,12 +10318,12 @@
874910318 */
875010319 if (init_event)
875110320 kvm_put_guest_fpu(vcpu);
8752
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8753
- XFEATURE_MASK_BNDREGS);
10321
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10322
+ XFEATURE_BNDREGS);
875410323 if (mpx_state_buffer)
875510324 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
8756
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8757
- XFEATURE_MASK_BNDCSR);
10325
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10326
+ XFEATURE_BNDCSR);
875810327 if (mpx_state_buffer)
875910328 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
876010329 if (init_event)
....@@ -8765,7 +10334,6 @@
876510334 kvm_pmu_reset(vcpu);
876610335 vcpu->arch.smbase = 0x30000;
876710336
8768
- vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
876910337 vcpu->arch.msr_misc_features_enables = 0;
877010338
877110339 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
....@@ -8777,7 +10345,7 @@
877710345
877810346 vcpu->arch.ia32_xss = 0;
877910347
8780
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
10348
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
878110349 }
878210350
878310351 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
....@@ -8801,8 +10369,8 @@
880110369 u64 max_tsc = 0;
880210370 bool stable, backwards_tsc = false;
880310371
8804
- kvm_shared_msr_cpu_online();
8805
- ret = kvm_x86_ops->hardware_enable();
10372
+ kvm_user_return_msr_cpu_online();
10373
+ ret = kvm_x86_ops.hardware_enable();
880610374 if (ret != 0)
880710375 return ret;
880810376
....@@ -8828,7 +10396,7 @@
882810396 * before any KVM threads can be running. Unfortunately, we can't
882910397 * bring the TSCs fully up to date with real time, as we aren't yet far
883010398 * enough into CPU bringup that we know how much real time has actually
8831
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
10399
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
883210400 * variables that haven't been updated yet.
883310401 *
883410402 * So we simply find the maximum observed TSC above, then record the
....@@ -8884,19 +10452,32 @@
888410452
888510453 void kvm_arch_hardware_disable(void)
888610454 {
8887
- kvm_x86_ops->hardware_disable();
10455
+ kvm_x86_ops.hardware_disable();
888810456 drop_user_return_notifiers();
888910457 }
889010458
8891
-int kvm_arch_hardware_setup(void)
10459
+int kvm_arch_hardware_setup(void *opaque)
889210460 {
10461
+ struct kvm_x86_init_ops *ops = opaque;
889310462 int r;
889410463
8895
- r = kvm_x86_ops->hardware_setup();
10464
+ rdmsrl_safe(MSR_EFER, &host_efer);
10465
+
10466
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
10467
+ rdmsrl(MSR_IA32_XSS, host_xss);
10468
+
10469
+ r = ops->hardware_setup();
889610470 if (r != 0)
889710471 return r;
889810472
8899
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
10473
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10474
+
10475
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10476
+ supported_xss = 0;
10477
+
10478
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10479
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10480
+#undef __kvm_cpu_cap_has
890010481
890110482 if (kvm_has_tsc_control) {
890210483 /*
....@@ -8918,12 +10499,21 @@
891810499
891910500 void kvm_arch_hardware_unsetup(void)
892010501 {
8921
- kvm_x86_ops->hardware_unsetup();
10502
+ kvm_x86_ops.hardware_unsetup();
892210503 }
892310504
8924
-void kvm_arch_check_processor_compat(void *rtn)
10505
+int kvm_arch_check_processor_compat(void *opaque)
892510506 {
8926
- kvm_x86_ops->check_processor_compatibility(rtn);
10507
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10508
+ struct kvm_x86_init_ops *ops = opaque;
10509
+
10510
+ WARN_ON(!irqs_disabled());
10511
+
10512
+ if (__cr4_reserved_bits(cpu_has, c) !=
10513
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10514
+ return -EIO;
10515
+
10516
+ return ops->check_processor_compatibility();
892710517 }
892810518
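Both kvm_arch_check_processor_compat() above and kvm_arch_hardware_setup() earlier feed a feature predicate into __cr4_reserved_bits(): that helper expects a two-argument pred(container, feature) shape, so kvm_cpu_cap_has(), which takes only the feature bit, is wrapped in a throwaway macro that drops the first argument. The fragment below is a made-up, self-contained illustration of that adapter pattern (reserved_mask, cap_has and the FEAT_* names are invented for the example, not the kernel's definitions):

/* Illustration only: adapt a one-argument predicate to a mask builder
 * that expects pred(ctx, feature) -- the same trick as __kvm_cpu_cap_has. */
#define FEAT_PCID 0
#define FEAT_SMEP 1

#define reserved_mask(pred, ctx)			\
	((pred(ctx, FEAT_PCID) ? 0UL : (1UL << 17)) |	\
	 (pred(ctx, FEAT_SMEP) ? 0UL : (1UL << 20)))

static int cap_has(int feature)		/* one-argument predicate */
{
	return feature == FEAT_SMEP;
}

#define cap_has_adapter(unused, f) cap_has(f)

/* reserved_mask(cap_has_adapter, 0) == (1UL << 17): only the PCID bit
 * stays reserved, because the SMEP "capability" is reported present. */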
892910519 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
....@@ -8940,107 +10530,35 @@
894010530 struct static_key kvm_no_apic_vcpu __read_mostly;
894110531 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
894210532
8943
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8944
-{
8945
- struct page *page;
8946
- int r;
8947
-
8948
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
8949
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
8950
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
8951
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
8952
- else
8953
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
8954
-
8955
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
8956
- if (!page) {
8957
- r = -ENOMEM;
8958
- goto fail;
8959
- }
8960
- vcpu->arch.pio_data = page_address(page);
8961
-
8962
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
8963
-
8964
- r = kvm_mmu_create(vcpu);
8965
- if (r < 0)
8966
- goto fail_free_pio_data;
8967
-
8968
- if (irqchip_in_kernel(vcpu->kvm)) {
8969
- r = kvm_create_lapic(vcpu);
8970
- if (r < 0)
8971
- goto fail_mmu_destroy;
8972
- } else
8973
- static_key_slow_inc(&kvm_no_apic_vcpu);
8974
-
8975
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
8976
- GFP_KERNEL);
8977
- if (!vcpu->arch.mce_banks) {
8978
- r = -ENOMEM;
8979
- goto fail_free_lapic;
8980
- }
8981
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
8982
-
8983
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
8984
- r = -ENOMEM;
8985
- goto fail_free_mce_banks;
8986
- }
8987
-
8988
- fx_init(vcpu);
8989
-
8990
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
8991
-
8992
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
8993
-
8994
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
8995
-
8996
- kvm_async_pf_hash_reset(vcpu);
8997
- kvm_pmu_init(vcpu);
8998
-
8999
- vcpu->arch.pending_external_vector = -1;
9000
- vcpu->arch.preempted_in_kernel = false;
9001
-
9002
- kvm_hv_vcpu_init(vcpu);
9003
-
9004
- return 0;
9005
-
9006
-fail_free_mce_banks:
9007
- kfree(vcpu->arch.mce_banks);
9008
-fail_free_lapic:
9009
- kvm_free_lapic(vcpu);
9010
-fail_mmu_destroy:
9011
- kvm_mmu_destroy(vcpu);
9012
-fail_free_pio_data:
9013
- free_page((unsigned long)vcpu->arch.pio_data);
9014
-fail:
9015
- return r;
9016
-}
9017
-
9018
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
9019
-{
9020
- int idx;
9021
-
9022
- kvm_hv_vcpu_uninit(vcpu);
9023
- kvm_pmu_destroy(vcpu);
9024
- kfree(vcpu->arch.mce_banks);
9025
- kvm_free_lapic(vcpu);
9026
- idx = srcu_read_lock(&vcpu->kvm->srcu);
9027
- kvm_mmu_destroy(vcpu);
9028
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
9029
- free_page((unsigned long)vcpu->arch.pio_data);
9030
- if (!lapic_in_kernel(vcpu))
9031
- static_key_slow_dec(&kvm_no_apic_vcpu);
9032
-}
9033
-
903410533 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
903510534 {
10535
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10536
+
903610537 vcpu->arch.l1tf_flush_l1d = true;
9037
- kvm_x86_ops->sched_in(vcpu, cpu);
10538
+ if (pmu->version && unlikely(pmu->event_count)) {
10539
+ pmu->need_cleanup = true;
10540
+ kvm_make_request(KVM_REQ_PMU, vcpu);
10541
+ }
10542
+ kvm_x86_ops.sched_in(vcpu, cpu);
903810543 }
10544
+
10545
+void kvm_arch_free_vm(struct kvm *kvm)
10546
+{
10547
+ kfree(kvm->arch.hyperv.hv_pa_pg);
10548
+ vfree(kvm);
10549
+}
10550
+
903910551
904010552 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
904110553 {
10554
+ int ret;
10555
+
904210556 if (type)
904310557 return -EINVAL;
10558
+
10559
+ ret = kvm_page_track_init(kvm);
10560
+ if (ret)
10561
+ return ret;
904410562
904510563 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
904610564 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
....@@ -9059,7 +10577,7 @@
905910577 mutex_init(&kvm->arch.apic_map_lock);
906010578 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
906110579
9062
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
10580
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
906310581 pvclock_update_vm_gtod_copy(kvm);
906410582
906510583 kvm->arch.guest_can_read_msr_platform_info = true;
....@@ -9068,13 +10586,9 @@
906810586 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
906910587
907010588 kvm_hv_init_vm(kvm);
9071
- kvm_page_track_init(kvm);
907210589 kvm_mmu_init_vm(kvm);
907310590
9074
- if (kvm_x86_ops->vm_init)
9075
- return kvm_x86_ops->vm_init(kvm);
9076
-
9077
- return 0;
10591
+ return kvm_x86_ops.vm_init(kvm);
907810592 }
907910593
908010594 int kvm_arch_post_init_vm(struct kvm *kvm)
....@@ -9102,7 +10616,7 @@
910210616 kvm_unload_vcpu_mmu(vcpu);
910310617 }
910410618 kvm_for_each_vcpu(i, vcpu, kvm)
9105
- kvm_arch_vcpu_free(vcpu);
10619
+ kvm_vcpu_destroy(vcpu);
910610620
910710621 mutex_lock(&kvm->lock);
910810622 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
....@@ -9122,9 +10636,9 @@
912210636 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
912310637 {
912410638 int i, r;
9125
- unsigned long hva;
10639
+ unsigned long hva, old_npages;
912610640 struct kvm_memslots *slots = kvm_memslots(kvm);
9127
- struct kvm_memory_slot *slot, old;
10641
+ struct kvm_memory_slot *slot;
912810642
912910643 /* Called with kvm->slots_lock held. */
913010644 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
....@@ -9132,7 +10646,7 @@
913210646
913310647 slot = id_to_memslot(slots, id);
913410648 if (size) {
9135
- if (slot->npages)
10649
+ if (slot && slot->npages)
913610650 return -EEXIST;
913710651
913810652 /*
....@@ -9144,13 +10658,13 @@
914410658 if (IS_ERR((void *)hva))
914510659 return PTR_ERR((void *)hva);
914610660 } else {
9147
- if (!slot->npages)
10661
+ if (!slot || !slot->npages)
914810662 return 0;
914910663
10664
+ old_npages = slot->npages;
915010665 hva = 0;
915110666 }
915210667
9153
- old = *slot;
915410668 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
915510669 struct kvm_userspace_memory_region m;
915610670
....@@ -9165,23 +10679,11 @@
916510679 }
916610680
916710681 if (!size)
9168
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
10682
+ vm_munmap(hva, old_npages * PAGE_SIZE);
916910683
917010684 return 0;
917110685 }
917210686 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
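With the locked x86_set_memory_region() wrapper removed below, callers are expected to hold kvm->slots_lock themselves: a non-zero size creates the internal slot, while size == 0 tears it down, exactly as kvm_arch_pre_destroy_vm() further down does for the private slots. A hedged sketch of the creation side (illustrative caller, not taken from the diff; the address is whatever the ioctl path supplies):

/* Illustrative only: mirror the locking rule for __x86_set_memory_region(). */
static int example_map_private_tss(struct kvm *kvm, gpa_t addr)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT,
				    addr, 3 * PAGE_SIZE);
	mutex_unlock(&kvm->slots_lock);
	return r;
}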
9173
-
9174
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9175
-{
9176
- int r;
9177
-
9178
- mutex_lock(&kvm->slots_lock);
9179
- r = __x86_set_memory_region(kvm, id, gpa, size);
9180
- mutex_unlock(&kvm->slots_lock);
9181
-
9182
- return r;
9183
-}
9184
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
918510687
918610688 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
918710689 {
....@@ -9196,46 +10698,47 @@
919610698 	 * unless the memory map has changed due to process exit
919710699 * or fd copying.
919810700 */
9199
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
9200
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
9201
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10701
+ mutex_lock(&kvm->slots_lock);
10702
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10703
+ 0, 0);
10704
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10705
+ 0, 0);
10706
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10707
+ mutex_unlock(&kvm->slots_lock);
920210708 }
9203
- if (kvm_x86_ops->vm_destroy)
9204
- kvm_x86_ops->vm_destroy(kvm);
10709
+ if (kvm_x86_ops.vm_destroy)
10710
+ kvm_x86_ops.vm_destroy(kvm);
10711
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
920510712 kvm_pic_destroy(kvm);
920610713 kvm_ioapic_destroy(kvm);
920710714 kvm_free_vcpus(kvm);
920810715 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10716
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
920910717 kvm_mmu_uninit_vm(kvm);
921010718 kvm_page_track_cleanup(kvm);
921110719 kvm_hv_destroy_vm(kvm);
921210720 }
921310721
9214
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
9215
- struct kvm_memory_slot *dont)
10722
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
921610723 {
921710724 int i;
921810725
921910726 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
9220
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
9221
- kvfree(free->arch.rmap[i]);
9222
- free->arch.rmap[i] = NULL;
9223
- }
10727
+ kvfree(slot->arch.rmap[i]);
10728
+ slot->arch.rmap[i] = NULL;
10729
+
922410730 if (i == 0)
922510731 continue;
922610732
9227
- if (!dont || free->arch.lpage_info[i - 1] !=
9228
- dont->arch.lpage_info[i - 1]) {
9229
- kvfree(free->arch.lpage_info[i - 1]);
9230
- free->arch.lpage_info[i - 1] = NULL;
9231
- }
10733
+ kvfree(slot->arch.lpage_info[i - 1]);
10734
+ slot->arch.lpage_info[i - 1] = NULL;
923210735 }
923310736
9234
- kvm_page_track_free_memslot(free, dont);
10737
+ kvm_page_track_free_memslot(slot);
923510738 }
923610739
9237
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
9238
- unsigned long npages)
10740
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10741
+ unsigned long npages)
923910742 {
924010743 int i;
924110744
....@@ -9257,13 +10760,13 @@
925710760
925810761 slot->arch.rmap[i] =
925910762 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
9260
- GFP_KERNEL);
10763
+ GFP_KERNEL_ACCOUNT);
926110764 if (!slot->arch.rmap[i])
926210765 goto out_free;
926310766 if (i == 0)
926410767 continue;
926510768
9266
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
10769
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
926710770 if (!linfo)
926810771 goto out_free;
926910772
....@@ -9276,11 +10779,9 @@
927610779 ugfn = slot->userspace_addr >> PAGE_SHIFT;
927710780 /*
927810781 * If the gfn and userspace address are not aligned wrt each
9279
- * other, or if explicitly asked to, disable large page
9280
- * support for this slot
10782
+ * other, disable large page support for this slot.
928110783 */
9282
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
9283
- !kvm_largepages_enabled()) {
10784
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
928410785 unsigned long j;
928510786
928610787 for (j = 0; j < lpages; ++j)
....@@ -9327,76 +10828,23 @@
932710828 const struct kvm_userspace_memory_region *mem,
932810829 enum kvm_mr_change change)
932910830 {
9330
- if (change == KVM_MR_MOVE)
9331
- return kvm_arch_create_memslot(kvm, memslot,
9332
- mem->memory_size >> PAGE_SHIFT);
9333
-
10831
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10832
+ return kvm_alloc_memslot_metadata(memslot,
10833
+ mem->memory_size >> PAGE_SHIFT);
933410834 return 0;
933510835 }
933610836
933710837 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
9338
- struct kvm_memory_slot *new)
10838
+ struct kvm_memory_slot *old,
10839
+ struct kvm_memory_slot *new,
10840
+ enum kvm_mr_change change)
933910841 {
9340
- /* Still write protect RO slot */
9341
- if (new->flags & KVM_MEM_READONLY) {
9342
- kvm_mmu_slot_remove_write_access(kvm, new);
9343
- return;
9344
- }
9345
-
934610842 /*
9347
- * Call kvm_x86_ops dirty logging hooks when they are valid.
9348
- *
9349
- * kvm_x86_ops->slot_disable_log_dirty is called when:
9350
- *
9351
- * - KVM_MR_CREATE with dirty logging is disabled
9352
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
9353
- *
9354
- * The reason is, in case of PML, we need to set D-bit for any slots
9355
- * with dirty logging disabled in order to eliminate unnecessary GPA
9356
- * logging in PML buffer (and potential PML buffer full VMEXT). This
9357
- * guarantees leaving PML enabled during guest's lifetime won't have
9358
- * any additonal overhead from PML when guest is running with dirty
9359
- * logging disabled for memory slots.
9360
- *
9361
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
9362
- * to dirty logging mode.
9363
- *
9364
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
9365
- *
9366
- * In case of write protect:
9367
- *
9368
- * Write protect all pages for dirty logging.
9369
- *
9370
- * All the sptes including the large sptes which point to this
9371
- * slot are set to readonly. We can not create any new large
9372
- * spte on this slot until the end of the logging.
9373
- *
9374
- * See the comments in fast_page_fault().
10843
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10844
+ * See comments below.
937510845 */
9376
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
9377
- if (kvm_x86_ops->slot_enable_log_dirty)
9378
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
9379
- else
9380
- kvm_mmu_slot_remove_write_access(kvm, new);
9381
- } else {
9382
- if (kvm_x86_ops->slot_disable_log_dirty)
9383
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
9384
- }
9385
-}
9386
-
9387
-void kvm_arch_commit_memory_region(struct kvm *kvm,
9388
- const struct kvm_userspace_memory_region *mem,
9389
- const struct kvm_memory_slot *old,
9390
- const struct kvm_memory_slot *new,
9391
- enum kvm_mr_change change)
9392
-{
9393
- int nr_mmu_pages = 0;
9394
-
9395
- if (!kvm->arch.n_requested_mmu_pages)
9396
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
9397
-
9398
- if (nr_mmu_pages)
9399
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
10846
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10847
+ return;
940010848
940110849 /*
940210850 * Dirty logging tracks sptes in 4k granularity, meaning that large
....@@ -9409,29 +10857,91 @@
940910857 * Scan sptes if dirty logging has been stopped, dropping those
941010858 * which can be collapsed into a single large-page spte. Later
941110859 * page faults will create the large-page sptes.
10860
+ *
10861
+ * There is no need to do this in any of the following cases:
10862
+ * CREATE: No dirty mappings will already exist.
10863
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
10864
+ * kvm_arch_flush_shadow_memslot()
941210865 */
9413
- if ((change != KVM_MR_DELETE) &&
9414
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
9415
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10866
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10867
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
941610868 kvm_mmu_zap_collapsible_sptes(kvm, new);
941710869
941810870 /*
9419
- * Set up write protection and/or dirty logging for the new slot.
10871
+ * Enable or disable dirty logging for the slot.
942010872 *
9421
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
9422
- * been zapped so no dirty logging staff is needed for old slot. For
9423
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
9424
- * new and it's also covered when dealing with the new slot.
10873
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10874
+ * slot have been zapped so no dirty logging updates are needed for
10875
+ * the old slot.
10876
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10877
+ * any mappings that might be created in it will consume the
10878
+ * properties of the new slot and do not need to be updated here.
942510879 *
10880
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10881
+ * called to enable/disable dirty logging.
10882
+ *
10883
+ * When disabling dirty logging with PML enabled, the D-bit is set
10884
+ * for sptes in the slot in order to prevent unnecessary GPA
10885
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
10886
+ * This guarantees leaving PML enabled for the guest's lifetime
10887
+ * won't have any additional overhead from PML when the guest is
10888
+ * running with dirty logging disabled.
10889
+ *
10890
+ * When enabling dirty logging, large sptes are write-protected
10891
+ * so they can be split on first write. New large sptes cannot
10892
+ * be created for this slot until the end of the logging.
10893
+ * See the comments in fast_page_fault().
10894
+ * For small sptes, nothing is done if the dirty log is in the
10895
+ * initial-all-set state. Otherwise, depending on whether pml
10896
+ * is enabled the D-bit or the W-bit will be cleared.
10897
+ */
10898
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10899
+ if (kvm_x86_ops.slot_enable_log_dirty) {
10900
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10901
+ } else {
10902
+ int level =
10903
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10904
+ PG_LEVEL_2M : PG_LEVEL_4K;
10905
+
10906
+ /*
10907
+ * If we're with initial-all-set, we don't need
10908
+ * to write protect any small page because
10909
+ * they're reported as dirty already. However
10910
+ * we still need to write-protect huge pages
10911
+ * so that the page split can happen lazily on
10912
+ * the first write to the huge page.
10913
+ */
10914
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
10915
+ }
10916
+ } else {
10917
+ if (kvm_x86_ops.slot_disable_log_dirty)
10918
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10919
+ }
10920
+}
10921
+
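The block comment above covers the in-kernel mechanics (write protection vs. PML D-bit clearing, plus the initial-all-set handling of huge pages). For context, the userspace half of dirty logging is just two ioctls; the sketch below is illustrative and assumes the VMM already owns vm_fd, the slot parameters and a bitmap sized at memory_size / PAGE_SIZE bits:

/* Userspace sketch (not part of this file): turn on dirty logging for a
 * slot and harvest the per-page dirty bitmap later. Error handling is
 * reduced to pass/fail. */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int track_dirty_pages(int vm_fd, __u32 slot, __u64 guest_phys,
			     __u64 size, void *host_va, void *bitmap)
{
	struct kvm_userspace_memory_region region = {
		.slot		 = slot,
		.flags		 = KVM_MEM_LOG_DIRTY_PAGES,
		.guest_phys_addr = guest_phys,
		.memory_size	 = size,
		.userspace_addr	 = (__u64)(unsigned long)host_va,
	};
	struct kvm_dirty_log log = {
		.slot	      = slot,
		.dirty_bitmap = bitmap,
	};

	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
		return -1;
	/* ... guest runs and dirties memory ... */
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}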
10922
+void kvm_arch_commit_memory_region(struct kvm *kvm,
10923
+ const struct kvm_userspace_memory_region *mem,
10924
+ struct kvm_memory_slot *old,
10925
+ const struct kvm_memory_slot *new,
10926
+ enum kvm_mr_change change)
10927
+{
10928
+ if (!kvm->arch.n_requested_mmu_pages)
10929
+ kvm_mmu_change_mmu_pages(kvm,
10930
+ kvm_mmu_calculate_default_mmu_pages(kvm));
10931
+
10932
+ /*
942610933 * FIXME: const-ify all uses of struct kvm_memory_slot.
942710934 */
9428
- if (change != KVM_MR_DELETE)
9429
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
10935
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
10936
+
10937
+ /* Free the arrays associated with the old memslot. */
10938
+ if (change == KVM_MR_MOVE)
10939
+ kvm_arch_free_memslot(kvm, old);
943010940 }
943110941
943210942 void kvm_arch_flush_shadow_all(struct kvm *kvm)
943310943 {
9434
- kvm_mmu_invalidate_zap_all_pages(kvm);
10944
+ kvm_mmu_zap_all(kvm);
943510945 }
943610946
943710947 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
....@@ -9443,8 +10953,8 @@
944310953 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
944410954 {
944510955 return (is_guest_mode(vcpu) &&
9446
- kvm_x86_ops->guest_apic_has_interrupt &&
9447
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
10956
+ kvm_x86_ops.guest_apic_has_interrupt &&
10957
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
944810958 }
944910959
945010960 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
....@@ -9463,11 +10973,12 @@
946310973
946410974 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
946510975 (vcpu->arch.nmi_pending &&
9466
- kvm_x86_ops->nmi_allowed(vcpu)))
10976
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
946710977 return true;
946810978
946910979 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
9470
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
10980
+ (vcpu->arch.smi_pending &&
10981
+ kvm_x86_ops.smi_allowed(vcpu, false)))
947110982 return true;
947210983
947310984 if (kvm_arch_interrupt_allowed(vcpu) &&
....@@ -9476,6 +10987,11 @@
947610987 return true;
947710988
947810989 if (kvm_hv_has_stimer_pending(vcpu))
10990
+ return true;
10991
+
10992
+ if (is_guest_mode(vcpu) &&
10993
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
10994
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
947910995 return true;
948010996
948110997 return false;
....@@ -9496,7 +11012,7 @@
949611012 kvm_test_request(KVM_REQ_EVENT, vcpu))
949711013 return true;
949811014
9499
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
11015
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
950011016 return true;
950111017
950211018 return false;
....@@ -9514,7 +11030,7 @@
951411030
951511031 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
951611032 {
9517
- return kvm_x86_ops->interrupt_allowed(vcpu);
11033
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
951811034 }
951911035
952011036 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
....@@ -9536,7 +11052,7 @@
953611052 {
953711053 unsigned long rflags;
953811054
9539
- rflags = kvm_x86_ops->get_rflags(vcpu);
11055
+ rflags = kvm_x86_ops.get_rflags(vcpu);
954011056 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
954111057 rflags &= ~X86_EFLAGS_TF;
954211058 return rflags;
....@@ -9548,7 +11064,7 @@
954811064 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
954911065 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
955011066 rflags |= X86_EFLAGS_TF;
9551
- kvm_x86_ops->set_rflags(vcpu, rflags);
11067
+ kvm_x86_ops.set_rflags(vcpu, rflags);
955211068 }
955311069
955411070 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
....@@ -9562,7 +11078,7 @@
956211078 {
956311079 int r;
956411080
9565
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
11081
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
956611082 work->wakeup_all)
956711083 return;
956811084
....@@ -9570,21 +11086,23 @@
957011086 if (unlikely(r))
957111087 return;
957211088
9573
- if (!vcpu->arch.mmu.direct_map &&
9574
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
11089
+ if (!vcpu->arch.mmu->direct_map &&
11090
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
957511091 return;
957611092
9577
- vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
11093
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
957811094 }
957911095
958011096 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
958111097 {
11098
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
11099
+
958211100 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
958311101 }
958411102
958511103 static inline u32 kvm_async_pf_next_probe(u32 key)
958611104 {
9587
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
11105
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
958811106 }
958911107
959011108 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
....@@ -9602,7 +11120,7 @@
960211120 int i;
960311121 u32 key = kvm_async_pf_hash_fn(gfn);
960411122
9605
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
11123
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
960611124 (vcpu->arch.apf.gfns[key] != gfn &&
960711125 vcpu->arch.apf.gfns[key] != ~0); i++)
960811126 key = kvm_async_pf_next_probe(key);
....@@ -9620,6 +11138,10 @@
962011138 u32 i, j, k;
962111139
962211140 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
11141
+
11142
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
11143
+ return;
11144
+
962311145 while (true) {
962411146 vcpu->arch.apf.gfns[i] = ~0;
962511147 do {
....@@ -9638,21 +11160,64 @@
963811160 }
963911161 }
964011162
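The async-PF gfn table above is a small open-addressed hash: ASYNC_PF_PER_VCPU is a power of two, so probing wraps with a simple mask, and kvm_del_async_pf_gfn() re-slots entries so later lookups never hit a hole. The toy below reproduces only the insert/probe part in isolation (own names, a throwaway hash, 0 as the empty marker; deletion/backshift is deliberately left out, and the caller is assumed to bound the number of live entries):

/* Standalone toy of the probing scheme, for illustration only. */
#define SLOTS 64U				/* must be a power of two */

static unsigned long long table[SLOTS];		/* 0 means "empty" here */

static unsigned int slot_hash(unsigned long long gfn)
{
	return (unsigned int)(gfn * 0x9E3779B97F4A7C15ULL) & (SLOTS - 1);
}

static void slot_add(unsigned long long gfn)
{
	unsigned int key = slot_hash(gfn);

	while (table[key])			/* linear probe ... */
		key = (key + 1) & (SLOTS - 1);	/* ... wrapping via the mask */
	table[key] = gfn;
}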
9641
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
11163
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
964211164 {
11165
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
964311166
9644
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
9645
- sizeof(val));
11167
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
11168
+ sizeof(reason));
964611169 }
964711170
9648
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
11171
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
964911172 {
11173
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
965011174
9651
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
9652
- sizeof(u32));
11175
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11176
+ &token, offset, sizeof(token));
965311177 }
965411178
9655
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
11179
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
11180
+{
11181
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11182
+ u32 val;
11183
+
11184
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11185
+ &val, offset, sizeof(val)))
11186
+ return false;
11187
+
11188
+ return !val;
11189
+}
11190
+
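apf_put_user_ready() and apf_pageready_slot_free() above form a one-deep mailbox around the token field of the guest's shared struct kvm_vcpu_pv_apf_data: the host writes a non-zero token for a 'page ready' event and will not queue another until the slot reads back as zero. A hedged guest-side sketch of the matching acknowledgement (the interrupt registration and any acknowledgement MSR write are omitted; apf_shared and wake_token_waiters() are assumptions for the example):

/* Guest-side sketch, not part of this diff: consume and clear the token
 * so the host sees the slot as free again. */
static struct kvm_vcpu_pv_apf_data *apf_shared;	/* page registered via the async-PF MSR */

static void handle_page_ready_interrupt(void)
{
	unsigned int token = READ_ONCE(apf_shared->token);

	if (token) {
		wake_token_waiters(token);		/* hypothetical: resume the stalled task */
		WRITE_ONCE(apf_shared->token, 0);	/* mark the slot free for the host */
	}
}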
11191
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
11192
+{
11193
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
11194
+ return false;
11195
+
11196
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
11197
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
11198
+ return false;
11199
+
11200
+ return true;
11201
+}
11202
+
11203
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
11204
+{
11205
+ if (unlikely(!lapic_in_kernel(vcpu) ||
11206
+ kvm_event_needs_reinjection(vcpu) ||
11207
+ vcpu->arch.exception.pending))
11208
+ return false;
11209
+
11210
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
11211
+ return false;
11212
+
11213
+ /*
11214
+ * If interrupts are off we cannot even use an artificial
11215
+ * halt state.
11216
+ */
11217
+ return kvm_arch_interrupt_allowed(vcpu);
11218
+}
11219
+
11220
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
965611221 struct kvm_async_pf *work)
965711222 {
965811223 struct x86_exception fault;
....@@ -9660,11 +11225,8 @@
966011225 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
966111226 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
966211227
9663
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
9664
- (vcpu->arch.apf.send_user_only &&
9665
- kvm_x86_ops->get_cpl(vcpu) == 0))
9666
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
9667
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
11228
+ if (kvm_can_deliver_async_pf(vcpu) &&
11229
+ !apf_put_user_notpresent(vcpu)) {
966811230 fault.vector = PF_VECTOR;
966911231 fault.error_code_valid = true;
967011232 fault.error_code = 0;
....@@ -9672,14 +11234,28 @@
967211234 fault.address = work->arch.token;
967311235 fault.async_page_fault = true;
967411236 kvm_inject_page_fault(vcpu, &fault);
11237
+ return true;
11238
+ } else {
11239
+ /*
11240
+ * It is not possible to deliver a paravirtualized asynchronous
11241
+ * page fault, but putting the guest in an artificial halt state
11242
+ * can be beneficial nevertheless: if an interrupt arrives, we
11243
+ * can deliver it timely and perhaps the guest will schedule
11244
+ * another process. When the instruction that triggered a page
11245
+ * fault is retried, hopefully the page will be ready in the host.
11246
+ */
11247
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
11248
+ return false;
967511249 }
967611250 }
967711251
967811252 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
967911253 struct kvm_async_pf *work)
968011254 {
9681
- struct x86_exception fault;
9682
- u32 val;
11255
+ struct kvm_lapic_irq irq = {
11256
+ .delivery_mode = APIC_DM_FIXED,
11257
+ .vector = vcpu->arch.apf.vec
11258
+ };
968311259
968411260 if (work->wakeup_all)
968511261 work->arch.token = ~0; /* broadcast wakeup */
....@@ -9687,37 +11263,30 @@
968711263 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
968811264 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
968911265
9690
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
9691
- !apf_get_user(vcpu, &val)) {
9692
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
9693
- vcpu->arch.exception.pending &&
9694
- vcpu->arch.exception.nr == PF_VECTOR &&
9695
- !apf_put_user(vcpu, 0)) {
9696
- vcpu->arch.exception.injected = false;
9697
- vcpu->arch.exception.pending = false;
9698
- vcpu->arch.exception.nr = 0;
9699
- vcpu->arch.exception.has_error_code = false;
9700
- vcpu->arch.exception.error_code = 0;
9701
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
9702
- fault.vector = PF_VECTOR;
9703
- fault.error_code_valid = true;
9704
- fault.error_code = 0;
9705
- fault.nested_page_fault = false;
9706
- fault.address = work->arch.token;
9707
- fault.async_page_fault = true;
9708
- kvm_inject_page_fault(vcpu, &fault);
9709
- }
11266
+ if ((work->wakeup_all || work->notpresent_injected) &&
11267
+ kvm_pv_async_pf_enabled(vcpu) &&
11268
+ !apf_put_user_ready(vcpu, work->arch.token)) {
11269
+ vcpu->arch.apf.pageready_pending = true;
11270
+ kvm_apic_set_irq(vcpu, &irq, NULL);
971011271 }
11272
+
971111273 vcpu->arch.apf.halted = false;
971211274 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
971311275 }
971411276
9715
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
11277
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
971611278 {
9717
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
11279
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
11280
+ if (!vcpu->arch.apf.pageready_pending)
11281
+ kvm_vcpu_kick(vcpu);
11282
+}
11283
+
11284
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11285
+{
11286
+ if (!kvm_pv_async_pf_enabled(vcpu))
971811287 return true;
971911288 else
9720
- return kvm_can_do_async_pf(vcpu);
11289
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
972111290 }
972211291
972311292 void kvm_arch_start_assignment(struct kvm *kvm)
....@@ -9732,9 +11301,9 @@
973211301 }
973311302 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
973411303
9735
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
11304
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
973611305 {
9737
- return atomic_read(&kvm->arch.assigned_device_count);
11306
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
973811307 }
973911308 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
974011309
....@@ -9758,7 +11327,7 @@
975811327
975911328 bool kvm_arch_has_irq_bypass(void)
976011329 {
9761
- return kvm_x86_ops->update_pi_irte != NULL;
11330
+ return true;
976211331 }
976311332
976411333 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
....@@ -9766,11 +11335,17 @@
976611335 {
976711336 struct kvm_kernel_irqfd *irqfd =
976811337 container_of(cons, struct kvm_kernel_irqfd, consumer);
11338
+ int ret;
976911339
977011340 irqfd->producer = prod;
11341
+ kvm_arch_start_assignment(irqfd->kvm);
11342
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11343
+ prod->irq, irqfd->gsi, 1);
977111344
9772
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
9773
- prod->irq, irqfd->gsi, 1);
11345
+ if (ret)
11346
+ kvm_arch_end_assignment(irqfd->kvm);
11347
+
11348
+ return ret;
977411349 }
977511350
977611351 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
....@@ -9789,26 +11364,185 @@
978911364 * when the irq is masked/disabled or the consumer side (KVM
979011365 	 * in this case) doesn't want to receive the interrupts.
979111366 */
9792
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11367
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
979311368 if (ret)
979411369 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
979511370 " fails: %d\n", irqfd->consumer.token, ret);
11371
+
11372
+ kvm_arch_end_assignment(irqfd->kvm);
979611373 }
979711374
979811375 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
979911376 uint32_t guest_irq, bool set)
980011377 {
9801
- if (!kvm_x86_ops->update_pi_irte)
9802
- return -EINVAL;
9803
-
9804
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
11378
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
980511379 }
980611380
980711381 bool kvm_vector_hashing_enabled(void)
980811382 {
980911383 return vector_hashing;
981011384 }
9811
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
11385
+
11386
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11387
+{
11388
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11389
+}
11390
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11391
+
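kvm_arch_no_poll() above keys host-side halt polling off bit 0 of the guest's poll-control MSR, and kvm_arch_vcpu_postcreate() earlier defaults the value to 1 (polling allowed). A guest that polls in its own idle loop can clear the bit, roughly as sketched here (guest kernel context; MSR_KVM_POLL_CONTROL and KVM_FEATURE_POLL_CONTROL are taken from the para-virt headers and assumed available):

/* Guest-side sketch: opt out of host halt polling. */
static void guest_disable_host_poll(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		wrmsrl(MSR_KVM_POLL_CONTROL, 0);	/* bit 0 clear: host should not poll */
}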
11392
+
11393
+int kvm_spec_ctrl_test_value(u64 value)
11394
+{
11395
+ /*
11396
+ * test that setting IA32_SPEC_CTRL to given value
11397
+ * is allowed by the host processor
11398
+ */
11399
+
11400
+ u64 saved_value;
11401
+ unsigned long flags;
11402
+ int ret = 0;
11403
+
11404
+ local_irq_save(flags);
11405
+
11406
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11407
+ ret = 1;
11408
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11409
+ ret = 1;
11410
+ else
11411
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11412
+
11413
+ local_irq_restore(flags);
11414
+
11415
+ return ret;
11416
+}
11417
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
11418
+
11419
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11420
+{
11421
+ struct x86_exception fault;
11422
+ u32 access = error_code &
11423
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11424
+
11425
+ if (!(error_code & PFERR_PRESENT_MASK) ||
11426
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11427
+ /*
11428
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11429
+ * tables probably do not match the TLB. Just proceed
11430
+ * with the error code that the processor gave.
11431
+ */
11432
+ fault.vector = PF_VECTOR;
11433
+ fault.error_code_valid = true;
11434
+ fault.error_code = error_code;
11435
+ fault.nested_page_fault = false;
11436
+ fault.address = gva;
11437
+ }
11438
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11439
+}
11440
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11441
+
11442
+/*
11443
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11444
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11445
+ * indicates whether exit to userspace is needed.
11446
+ */
11447
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11448
+ struct x86_exception *e)
11449
+{
11450
+ if (r == X86EMUL_PROPAGATE_FAULT) {
11451
+ kvm_inject_emulated_page_fault(vcpu, e);
11452
+ return 1;
11453
+ }
11454
+
11455
+ /*
11456
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11457
+ * while handling a VMX instruction KVM could've handled the request
11458
+ * correctly by exiting to userspace and performing I/O but there
11459
+ * doesn't seem to be a real use-case behind such requests, just return
11460
+ * KVM_EXIT_INTERNAL_ERROR for now.
11461
+ */
11462
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11463
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11464
+ vcpu->run->internal.ndata = 0;
11465
+
11466
+ return 0;
11467
+}
11468
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
11469
+
11470
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11471
+{
11472
+ bool pcid_enabled;
11473
+ struct x86_exception e;
11474
+ unsigned i;
11475
+ unsigned long roots_to_free = 0;
11476
+ struct {
11477
+ u64 pcid;
11478
+ u64 gla;
11479
+ } operand;
11480
+ int r;
11481
+
11482
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11483
+ if (r != X86EMUL_CONTINUE)
11484
+ return kvm_handle_memory_failure(vcpu, r, &e);
11485
+
11486
+ if (operand.pcid >> 12 != 0) {
11487
+ kvm_inject_gp(vcpu, 0);
11488
+ return 1;
11489
+ }
11490
+
11491
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11492
+
11493
+ switch (type) {
11494
+ case INVPCID_TYPE_INDIV_ADDR:
11495
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
11496
+ is_noncanonical_address(operand.gla, vcpu)) {
11497
+ kvm_inject_gp(vcpu, 0);
11498
+ return 1;
11499
+ }
11500
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11501
+ return kvm_skip_emulated_instruction(vcpu);
11502
+
11503
+ case INVPCID_TYPE_SINGLE_CTXT:
11504
+ if (!pcid_enabled && (operand.pcid != 0)) {
11505
+ kvm_inject_gp(vcpu, 0);
11506
+ return 1;
11507
+ }
11508
+
11509
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11510
+ kvm_mmu_sync_roots(vcpu);
11511
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11512
+ }
11513
+
11514
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11515
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11516
+ == operand.pcid)
11517
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11518
+
11519
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11520
+ /*
11521
+ * If neither the current cr3 nor any of the prev_roots use the
11522
+ * given PCID, then nothing needs to be done here because a
11523
+ * resync will happen anyway before switching to any other CR3.
11524
+ */
11525
+
11526
+ return kvm_skip_emulated_instruction(vcpu);
11527
+
11528
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
11529
+ /*
11530
+ * Currently, KVM doesn't mark global entries in the shadow
11531
+ * page tables, so a non-global flush just degenerates to a
11532
+ * global flush. If needed, we could optimize this later by
11533
+ * keeping track of global entries in shadow page tables.
11534
+ */
11535
+
11536
+ fallthrough;
11537
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
11538
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
11539
+ return kvm_skip_emulated_instruction(vcpu);
11540
+
11541
+ default:
11542
+ BUG(); /* We have already checked above that type <= 3 */
11543
+ }
11544
+}
11545
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
981211546
981311547 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
981411548 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
....@@ -9820,12 +11554,31 @@
982011554 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
982111555 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
982211556 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11557
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
982311558 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
982411559 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
982511560 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
982611561 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
9827
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
11562
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
982811563 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
982911564 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
983011565 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
983111566 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11567
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11568
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11569
+
11570
+static int __init kvm_x86_init(void)
11571
+{
11572
+ kvm_mmu_x86_module_init();
11573
+ return 0;
11574
+}
11575
+module_init(kvm_x86_init);
11576
+
11577
+static void __exit kvm_x86_exit(void)
11578
+{
11579
+ /*
11580
+ * If module_init() is implemented, module_exit() must also be
11581
+ * implemented to allow module unload.
11582
+ */
11583
+}
11584
+module_exit(kvm_x86_exit);