2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Kernel-based Virtual Machine driver for Linux
  *
@@ -13,22 +14,21 @@
  * Yaniv Kamay <yaniv@qumranet.com>
  * Amit Shah <amit.shah@qumranet.com>
  * Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
  */

 #include <linux/kvm_host.h>
 #include "irq.h"
+#include "ioapic.h"
 #include "mmu.h"
 #include "i8254.h"
 #include "tss.h"
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 #include "x86.h"
 #include "cpuid.h"
 #include "pmu.h"
 #include "hyperv.h"
+#include "lapic.h"

 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
@@ -54,7 +54,9 @@
 #include <linux/kvm_irqfd.h>
 #include <linux/irqbypass.h>
 #include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>

 #include <trace/events/kvm.h>

@@ -69,6 +71,10 @@
 #include <asm/irq_remapping.h>
 #include <asm/mshyperv.h>
 #include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <clocksource/hyperv_timer.h>

 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -79,7 +85,7 @@
 EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);

 #define emul_to_vcpu(ctxt) \
-	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+	((struct kvm_vcpu *)(ctxt)->vcpu)

 /* EFER defaults:
  * - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@

 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
 				KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

@@ -108,7 +111,7 @@
 static void store_regs(struct kvm_vcpu *vcpu);
 static int sync_regs(struct kvm_vcpu *vcpu);

-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
 EXPORT_SYMBOL_GPL(kvm_x86_ops);

 static bool __read_mostly ignore_msrs = 0;
@@ -138,10 +141,14 @@
 static u32 __read_mostly tsc_tolerance_ppm = 250;
 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);

 static bool __read_mostly vector_hashing = true;
 module_param(vector_hashing, bool, S_IRUGO);
@@ -153,85 +160,149 @@
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);

-#define KVM_NR_SHARED_MSRS 16
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);

-struct kvm_shared_msrs_global {
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs_global {
 	int nr;
-	u32 msrs[KVM_NR_SHARED_MSRS];
+	u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
 };

-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
 	struct user_return_notifier urn;
 	bool registered;
-	struct kvm_shared_msr_values {
+	struct kvm_user_return_msr_values {
 		u64 host;
 		u64 curr;
-	} values[KVM_NR_SHARED_MSRS];
+	} values[KVM_MAX_NR_USER_RETURN_MSRS];
 };

-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+				| XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);

 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "pf_fixed", VCPU_STAT(pf_fixed) },
-	{ "pf_guest", VCPU_STAT(pf_guest) },
-	{ "tlb_flush", VCPU_STAT(tlb_flush) },
-	{ "invlpg", VCPU_STAT(invlpg) },
-	{ "exits", VCPU_STAT(exits) },
-	{ "io_exits", VCPU_STAT(io_exits) },
-	{ "mmio_exits", VCPU_STAT(mmio_exits) },
-	{ "signal_exits", VCPU_STAT(signal_exits) },
-	{ "irq_window", VCPU_STAT(irq_window_exits) },
-	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
-	{ "halt_exits", VCPU_STAT(halt_exits) },
-	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
-	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
-	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
-	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
-	{ "hypercalls", VCPU_STAT(hypercalls) },
-	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "irq_exits", VCPU_STAT(irq_exits) },
-	{ "host_state_reload", VCPU_STAT(host_state_reload) },
-	{ "fpu_reload", VCPU_STAT(fpu_reload) },
-	{ "insn_emulation", VCPU_STAT(insn_emulation) },
-	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
-	{ "irq_injections", VCPU_STAT(irq_injections) },
-	{ "nmi_injections", VCPU_STAT(nmi_injections) },
-	{ "req_event", VCPU_STAT(req_event) },
-	{ "l1d_flush", VCPU_STAT(l1d_flush) },
-	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
-	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
-	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
-	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
-	{ "mmu_flooded", VM_STAT(mmu_flooded) },
-	{ "mmu_recycled", VM_STAT(mmu_recycled) },
-	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
-	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
-	{ "largepages", VM_STAT(lpages, .mode = 0444) },
-	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
-	{ "max_mmu_page_hash_collisions",
-		VM_STAT(max_mmu_page_hash_collisions) },
+	VCPU_STAT("pf_fixed", pf_fixed),
+	VCPU_STAT("pf_guest", pf_guest),
+	VCPU_STAT("tlb_flush", tlb_flush),
+	VCPU_STAT("invlpg", invlpg),
+	VCPU_STAT("exits", exits),
+	VCPU_STAT("io_exits", io_exits),
+	VCPU_STAT("mmio_exits", mmio_exits),
+	VCPU_STAT("signal_exits", signal_exits),
+	VCPU_STAT("irq_window", irq_window_exits),
+	VCPU_STAT("nmi_window", nmi_window_exits),
+	VCPU_STAT("halt_exits", halt_exits),
+	VCPU_STAT("halt_successful_poll", halt_successful_poll),
+	VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+	VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+	VCPU_STAT("halt_wakeup", halt_wakeup),
+	VCPU_STAT("hypercalls", hypercalls),
+	VCPU_STAT("request_irq", request_irq_exits),
+	VCPU_STAT("irq_exits", irq_exits),
+	VCPU_STAT("host_state_reload", host_state_reload),
+	VCPU_STAT("fpu_reload", fpu_reload),
+	VCPU_STAT("insn_emulation", insn_emulation),
+	VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+	VCPU_STAT("irq_injections", irq_injections),
+	VCPU_STAT("nmi_injections", nmi_injections),
+	VCPU_STAT("req_event", req_event),
+	VCPU_STAT("l1d_flush", l1d_flush),
+	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("preemption_reported", preemption_reported),
+	VCPU_STAT("preemption_other", preemption_other),
+	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+	VM_STAT("mmu_pte_write", mmu_pte_write),
+	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+	VM_STAT("mmu_flooded", mmu_flooded),
+	VM_STAT("mmu_recycled", mmu_recycled),
+	VM_STAT("mmu_cache_miss", mmu_cache_miss),
+	VM_STAT("mmu_unsync", mmu_unsync),
+	VM_STAT("remote_tlb_flush", remote_tlb_flush),
+	VM_STAT("largepages", lpages, .mode = 0444),
+	VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+	VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
 	{ NULL }
 };

 u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
+
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+				  u64 data, bool write)
+{
+	const char *op = write ? "wrmsr" : "rdmsr";
+
+	if (ignore_msrs) {
+		if (report_ignored_msrs)
+			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		/* Mask the error */
+		return true;
+	} else {
+		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+				      op, msr, data);
+		return false;
+	}
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+	unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+	return kmem_cache_create_usercopy("x86_emulator", size,
+					  __alignof__(struct x86_emulate_ctxt),
+					  SLAB_ACCOUNT, useroffset,
+					  size - useroffset, NULL);
+}

 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
 {
 	int i;
-	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
 		vcpu->arch.apf.gfns[i] = ~0;
 }

 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
-	struct kvm_shared_msrs *locals
-		= container_of(urn, struct kvm_shared_msrs, urn);
-	struct kvm_shared_msr_values *values;
+	struct kvm_user_return_msrs *msrs
+		= container_of(urn, struct kvm_user_return_msrs, urn);
+	struct kvm_user_return_msr_values *values;
 	unsigned long flags;

 	/*
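Side note, not part of the commit: the comment introduced above explains why host values of user-return MSRs (SYSCALL MSRs, TSC_AUX) can be restored lazily instead of on every VM-exit. Below is a minimal standalone C sketch of that defer-until-return pattern; all names are hypothetical and only mirror the shape of kvm_set_user_return_msr()/kvm_on_user_return(), they are not the kernel's user_return_notifier API.

/*
 * Illustrative sketch only -- not taken from the commit. Remember the host
 * value per slot, write hardware only when the value actually changes, and
 * restore host values once on the way back to userspace rather than on
 * every VM-exit.
 */
#include <stdbool.h>
#include <stdint.h>

#define NR_DEFERRED_MSRS 16	/* mirrors KVM_MAX_NR_USER_RETURN_MSRS */

struct deferred_msr {
	uint32_t index;		/* MSR number */
	uint64_t host;		/* value the host expects */
	uint64_t curr;		/* value currently loaded in hardware */
};

static struct deferred_msr deferred[NR_DEFERRED_MSRS];
static int nr_deferred;
static bool restore_pending;

static void hw_write_msr(uint32_t index, uint64_t value)
{
	/* Stand-in for wrmsrl(); a real implementation touches hardware. */
	(void)index;
	(void)value;
}

/* Load a guest value; skip the expensive MSR write if nothing changed. */
void deferred_msr_set(int slot, uint64_t guest_value)
{
	struct deferred_msr *m = &deferred[slot];

	if (guest_value == m->curr)
		return;
	hw_write_msr(m->index, guest_value);
	m->curr = guest_value;
	restore_pending = true;	/* arm the return-to-userspace hook */
}

/* Called once when control is about to leave the kernel. */
void deferred_msr_on_user_return(void)
{
	int i;

	if (!restore_pending)
		return;
	for (i = 0; i < nr_deferred; i++) {
		if (deferred[i].curr != deferred[i].host) {
			hw_write_msr(deferred[i].index, deferred[i].host);
			deferred[i].curr = deferred[i].host;
		}
	}
	restore_pending = false;
}

The point of the pattern is that the WRMSR restoring the host value happens at most once per excursion to userspace, not once per VM-exit.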
....@@ -239,84 +310,89 @@
239310 * interrupted and executed through kvm_arch_hardware_disable()
240311 */
241312 local_irq_save(flags);
242
- if (locals->registered) {
243
- locals->registered = false;
313
+ if (msrs->registered) {
314
+ msrs->registered = false;
244315 user_return_notifier_unregister(urn);
245316 }
246317 local_irq_restore(flags);
247
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
248
- values = &locals->values[slot];
318
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
319
+ values = &msrs->values[slot];
249320 if (values->host != values->curr) {
250
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
321
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
251322 values->curr = values->host;
252323 }
253324 }
254325 }
255326
256
-static void shared_msr_update(unsigned slot, u32 msr)
327
+int kvm_probe_user_return_msr(u32 msr)
257328 {
329
+ u64 val;
330
+ int ret;
331
+
332
+ preempt_disable();
333
+ ret = rdmsrl_safe(msr, &val);
334
+ if (ret)
335
+ goto out;
336
+ ret = wrmsrl_safe(msr, val);
337
+out:
338
+ preempt_enable();
339
+ return ret;
340
+}
341
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
342
+
343
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
344
+{
345
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
346
+ user_return_msrs_global.msrs[slot] = msr;
347
+ if (slot >= user_return_msrs_global.nr)
348
+ user_return_msrs_global.nr = slot + 1;
349
+}
350
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
351
+
352
+static void kvm_user_return_msr_cpu_online(void)
353
+{
354
+ unsigned int cpu = smp_processor_id();
355
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
258356 u64 value;
259
- unsigned int cpu = smp_processor_id();
260
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
357
+ int i;
261358
262
- /* only read, and nobody should modify it at this time,
263
- * so don't need lock */
264
- if (slot >= shared_msrs_global.nr) {
265
- printk(KERN_ERR "kvm: invalid MSR slot!");
266
- return;
359
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
360
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
361
+ msrs->values[i].host = value;
362
+ msrs->values[i].curr = value;
267363 }
268
- rdmsrl_safe(msr, &value);
269
- smsr->values[slot].host = value;
270
- smsr->values[slot].curr = value;
271364 }
272365
273
-void kvm_define_shared_msr(unsigned slot, u32 msr)
274
-{
275
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
276
- shared_msrs_global.msrs[slot] = msr;
277
- if (slot >= shared_msrs_global.nr)
278
- shared_msrs_global.nr = slot + 1;
279
-}
280
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
281
-
282
-static void kvm_shared_msr_cpu_online(void)
283
-{
284
- unsigned i;
285
-
286
- for (i = 0; i < shared_msrs_global.nr; ++i)
287
- shared_msr_update(i, shared_msrs_global.msrs[i]);
288
-}
289
-
290
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
366
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
291367 {
292368 unsigned int cpu = smp_processor_id();
293
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
369
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
294370 int err;
295371
296
- value = (value & mask) | (smsr->values[slot].host & ~mask);
297
- if (value == smsr->values[slot].curr)
372
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
373
+ if (value == msrs->values[slot].curr)
298374 return 0;
299
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
375
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
300376 if (err)
301377 return 1;
302378
303
- smsr->values[slot].curr = value;
304
- if (!smsr->registered) {
305
- smsr->urn.on_user_return = kvm_on_user_return;
306
- user_return_notifier_register(&smsr->urn);
307
- smsr->registered = true;
379
+ msrs->values[slot].curr = value;
380
+ if (!msrs->registered) {
381
+ msrs->urn.on_user_return = kvm_on_user_return;
382
+ user_return_notifier_register(&msrs->urn);
383
+ msrs->registered = true;
308384 }
309385 return 0;
310386 }
311
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
387
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
312388
313389 static void drop_user_return_notifiers(void)
314390 {
315391 unsigned int cpu = smp_processor_id();
316
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
392
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
317393
318
- if (smsr->registered)
319
- kvm_on_user_return(&smsr->urn);
394
+ if (msrs->registered)
395
+ kvm_on_user_return(&msrs->urn);
320396 }
321397
322398 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
....@@ -348,14 +424,15 @@
348424 }
349425
350426 kvm_lapic_set_base(vcpu, msr_info->data);
427
+ kvm_recalculate_apic_map(vcpu->kvm);
351428 return 0;
352429 }
353430 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
354431
355
-asmlinkage __visible void kvm_spurious_fault(void)
432
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
356433 {
357434 /* Fault while not rebooting. We want the trace. */
358
- BUG();
435
+ BUG_ON(!kvm_rebooting);
359436 }
360437 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
361438
....@@ -384,6 +461,7 @@
384461 #define EXCPT_TRAP 1
385462 #define EXCPT_ABORT 2
386463 #define EXCPT_INTERRUPT 3
464
+#define EXCPT_DB 4
387465
388466 static int exception_type(int vector)
389467 {
....@@ -394,8 +472,14 @@
394472
395473 mask = 1 << vector;
396474
397
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
398
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
475
+ /*
476
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
477
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
478
+ */
479
+ if (mask & (1 << DB_VECTOR))
480
+ return EXCPT_DB;
481
+
482
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
399483 return EXCPT_TRAP;
400484
401485 if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
....@@ -405,9 +489,59 @@
405489 return EXCPT_FAULT;
406490 }
407491
492
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
493
+{
494
+ unsigned nr = vcpu->arch.exception.nr;
495
+ bool has_payload = vcpu->arch.exception.has_payload;
496
+ unsigned long payload = vcpu->arch.exception.payload;
497
+
498
+ if (!has_payload)
499
+ return;
500
+
501
+ switch (nr) {
502
+ case DB_VECTOR:
503
+ /*
504
+ * "Certain debug exceptions may clear bit 0-3. The
505
+ * remaining contents of the DR6 register are never
506
+ * cleared by the processor".
507
+ */
508
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
509
+ /*
510
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
511
+ */
512
+ vcpu->arch.dr6 |= DR6_RTM;
513
+ vcpu->arch.dr6 |= payload;
514
+ /*
515
+ * Bit 16 should be set in the payload whenever the #DB
516
+ * exception should clear DR6.RTM. This makes the payload
517
+ * compatible with the pending debug exceptions under VMX.
518
+ * Though not currently documented in the SDM, this also
519
+ * makes the payload compatible with the exit qualification
520
+ * for #DB exceptions under VMX.
521
+ */
522
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
523
+
524
+ /*
525
+ * The #DB payload is defined as compatible with the 'pending
526
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
527
+ * defined in the 'pending debug exceptions' field (enabled
528
+ * breakpoint), it is reserved and must be zero in DR6.
529
+ */
530
+ vcpu->arch.dr6 &= ~BIT(12);
531
+ break;
532
+ case PF_VECTOR:
533
+ vcpu->arch.cr2 = payload;
534
+ break;
535
+ }
536
+
537
+ vcpu->arch.exception.has_payload = false;
538
+ vcpu->arch.exception.payload = 0;
539
+}
540
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
541
+
408542 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
409543 unsigned nr, bool has_error, u32 error_code,
410
- bool reinject)
544
+ bool has_payload, unsigned long payload, bool reinject)
411545 {
412546 u32 prev_nr;
413547 int class1, class2;
....@@ -427,6 +561,14 @@
427561 */
428562 WARN_ON_ONCE(vcpu->arch.exception.pending);
429563 vcpu->arch.exception.injected = true;
564
+ if (WARN_ON_ONCE(has_payload)) {
565
+ /*
566
+ * A reinjected event has already
567
+ * delivered its payload.
568
+ */
569
+ has_payload = false;
570
+ payload = 0;
571
+ }
430572 } else {
431573 vcpu->arch.exception.pending = true;
432574 vcpu->arch.exception.injected = false;
....@@ -434,6 +576,10 @@
434576 vcpu->arch.exception.has_error_code = has_error;
435577 vcpu->arch.exception.nr = nr;
436578 vcpu->arch.exception.error_code = error_code;
579
+ vcpu->arch.exception.has_payload = has_payload;
580
+ vcpu->arch.exception.payload = payload;
581
+ if (!is_guest_mode(vcpu))
582
+ kvm_deliver_exception_payload(vcpu);
437583 return;
438584 }
439585
....@@ -458,6 +604,8 @@
458604 vcpu->arch.exception.has_error_code = true;
459605 vcpu->arch.exception.nr = DF_VECTOR;
460606 vcpu->arch.exception.error_code = 0;
607
+ vcpu->arch.exception.has_payload = false;
608
+ vcpu->arch.exception.payload = 0;
461609 } else
462610 /* replace previous exception with a new one in a hope
463611 that instruction re-execution will regenerate lost
....@@ -467,15 +615,29 @@
467615
468616 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
469617 {
470
- kvm_multiple_exception(vcpu, nr, false, 0, false);
618
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
471619 }
472620 EXPORT_SYMBOL_GPL(kvm_queue_exception);
473621
474622 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
475623 {
476
- kvm_multiple_exception(vcpu, nr, false, 0, true);
624
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
477625 }
478626 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
627
+
628
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
629
+ unsigned long payload)
630
+{
631
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
632
+}
633
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
634
+
635
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
636
+ u32 error_code, unsigned long payload)
637
+{
638
+ kvm_multiple_exception(vcpu, nr, true, error_code,
639
+ true, payload, false);
640
+}
479641
480642 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
481643 {
....@@ -493,23 +655,38 @@
493655 ++vcpu->stat.pf_guest;
494656 vcpu->arch.exception.nested_apf =
495657 is_guest_mode(vcpu) && fault->async_page_fault;
496
- if (vcpu->arch.exception.nested_apf)
658
+ if (vcpu->arch.exception.nested_apf) {
497659 vcpu->arch.apf.nested_apf_token = fault->address;
498
- else
499
- vcpu->arch.cr2 = fault->address;
500
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
660
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
661
+ } else {
662
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
663
+ fault->address);
664
+ }
501665 }
502666 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
503667
504
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
668
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
669
+ struct x86_exception *fault)
505670 {
506
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
507
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
508
- else
509
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
671
+ struct kvm_mmu *fault_mmu;
672
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
510673
674
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
675
+ vcpu->arch.walk_mmu;
676
+
677
+ /*
678
+ * Invalidate the TLB entry for the faulting address, if it exists,
679
+ * else the access will fault indefinitely (and to emulate hardware).
680
+ */
681
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
682
+ !(fault->error_code & PFERR_RSVD_MASK))
683
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
684
+ fault_mmu->root_hpa);
685
+
686
+ fault_mmu->inject_page_fault(vcpu, fault);
511687 return fault->nested_page_fault;
512688 }
689
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
513690
514691 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
515692 {
....@@ -520,13 +697,13 @@
520697
521698 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
522699 {
523
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
700
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
524701 }
525702 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
526703
527704 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
528705 {
529
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
706
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
530707 }
531708 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
532709
....@@ -536,7 +713,7 @@
536713 */
537714 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
538715 {
539
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
716
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
540717 return true;
541718 kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
542719 return false;
....@@ -618,10 +795,8 @@
618795 ret = 1;
619796
620797 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
621
- __set_bit(VCPU_EXREG_PDPTR,
622
- (unsigned long *)&vcpu->arch.regs_avail);
623
- __set_bit(VCPU_EXREG_PDPTR,
624
- (unsigned long *)&vcpu->arch.regs_dirty);
798
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
799
+
625800 out:
626801
627802 return ret;
....@@ -631,7 +806,6 @@
631806 bool pdptrs_changed(struct kvm_vcpu *vcpu)
632807 {
633808 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
634
- bool changed = true;
635809 int offset;
636810 gfn_t gfn;
637811 int r;
....@@ -639,8 +813,7 @@
639813 if (!is_pae_paging(vcpu))
640814 return false;
641815
642
- if (!test_bit(VCPU_EXREG_PDPTR,
643
- (unsigned long *)&vcpu->arch.regs_avail))
816
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
644817 return true;
645818
646819 gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
....@@ -648,17 +821,16 @@
648821 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
649822 PFERR_USER_MASK | PFERR_WRITE_MASK);
650823 if (r < 0)
651
- goto out;
652
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
653
-out:
824
+ return true;
654825
655
- return changed;
826
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
656827 }
657828 EXPORT_SYMBOL_GPL(pdptrs_changed);
658829
659830 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
660831 {
661832 unsigned long old_cr0 = kvm_read_cr0(vcpu);
833
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
662834 unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
663835
664836 cr0 |= X86_CR0_ET;
....@@ -676,27 +848,27 @@
676848 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
677849 return 1;
678850
679
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
680851 #ifdef CONFIG_X86_64
681
- if ((vcpu->arch.efer & EFER_LME)) {
682
- int cs_db, cs_l;
852
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
853
+ (cr0 & X86_CR0_PG)) {
854
+ int cs_db, cs_l;
683855
684
- if (!is_pae(vcpu))
685
- return 1;
686
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
687
- if (cs_l)
688
- return 1;
689
- } else
690
-#endif
691
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
692
- kvm_read_cr3(vcpu)))
856
+ if (!is_pae(vcpu))
857
+ return 1;
858
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
859
+ if (cs_l)
693860 return 1;
694861 }
862
+#endif
863
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
864
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
865
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
866
+ return 1;
695867
696868 if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
697869 return 1;
698870
699
- kvm_x86_ops->set_cr0(vcpu, cr0);
871
+ kvm_x86_ops.set_cr0(vcpu, cr0);
700872
701873 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
702874 kvm_clear_async_pf_completion_queue(vcpu);
....@@ -721,27 +893,48 @@
721893 }
722894 EXPORT_SYMBOL_GPL(kvm_lmsw);
723895
724
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
896
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
725897 {
726
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
727
- !vcpu->guest_xcr0_loaded) {
728
- /* kvm_set_xcr() also depends on this */
898
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
899
+
729900 if (vcpu->arch.xcr0 != host_xcr0)
730901 xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
731
- vcpu->guest_xcr0_loaded = 1;
732
- }
733
-}
734
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
735902
736
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
903
+ if (vcpu->arch.xsaves_enabled &&
904
+ vcpu->arch.ia32_xss != host_xss)
905
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
906
+ }
907
+
908
+ if (static_cpu_has(X86_FEATURE_PKU) &&
909
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
910
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
911
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
912
+ __write_pkru(vcpu->arch.pkru);
913
+}
914
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
915
+
916
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
737917 {
738
- if (vcpu->guest_xcr0_loaded) {
918
+ if (static_cpu_has(X86_FEATURE_PKU) &&
919
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
920
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
921
+ vcpu->arch.pkru = rdpkru();
922
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
923
+ __write_pkru(vcpu->arch.host_pkru);
924
+ }
925
+
926
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
927
+
739928 if (vcpu->arch.xcr0 != host_xcr0)
740929 xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
741
- vcpu->guest_xcr0_loaded = 0;
930
+
931
+ if (vcpu->arch.xsaves_enabled &&
932
+ vcpu->arch.ia32_xss != host_xss)
933
+ wrmsrl(MSR_IA32_XSS, host_xss);
742934 }
935
+
743936 }
744
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
937
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
745938
746939 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
747940 {
....@@ -779,13 +972,13 @@
779972 vcpu->arch.xcr0 = xcr0;
780973
781974 if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
782
- kvm_update_cpuid(vcpu);
975
+ kvm_update_cpuid_runtime(vcpu);
783976 return 0;
784977 }
785978
786979 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
787980 {
788
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
981
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
789982 __kvm_set_xcr(vcpu, index, xcr)) {
790983 kvm_inject_gp(vcpu, 0);
791984 return 1;
....@@ -794,63 +987,20 @@
794987 }
795988 EXPORT_SYMBOL_GPL(kvm_set_xcr);
796989
797
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
798
-{
799
- u64 reserved_bits = CR4_RESERVED_BITS;
800
-
801
- if (!cpu_has(c, X86_FEATURE_XSAVE))
802
- reserved_bits |= X86_CR4_OSXSAVE;
803
-
804
- if (!cpu_has(c, X86_FEATURE_SMEP))
805
- reserved_bits |= X86_CR4_SMEP;
806
-
807
- if (!cpu_has(c, X86_FEATURE_SMAP))
808
- reserved_bits |= X86_CR4_SMAP;
809
-
810
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
811
- reserved_bits |= X86_CR4_FSGSBASE;
812
-
813
- if (!cpu_has(c, X86_FEATURE_PKU))
814
- reserved_bits |= X86_CR4_PKE;
815
-
816
- if (!cpu_has(c, X86_FEATURE_LA57) &&
817
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
818
- reserved_bits |= X86_CR4_LA57;
819
-
820
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
821
- reserved_bits |= X86_CR4_UMIP;
822
-
823
- return reserved_bits;
824
-}
825
-
826
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
990
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
827991 {
828992 if (cr4 & cr4_reserved_bits)
829993 return -EINVAL;
830994
831
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
995
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
832996 return -EINVAL;
833997
834
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
835
- return -EINVAL;
836
-
837
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
838
- return -EINVAL;
839
-
840
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
841
- return -EINVAL;
842
-
843
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
844
- return -EINVAL;
845
-
846
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
847
- return -EINVAL;
848
-
849
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
998
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
850999 return -EINVAL;
8511000
8521001 return 0;
8531002 }
1003
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
8541004
8551005 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
8561006 {
....@@ -882,15 +1032,14 @@
8821032 return 1;
8831033 }
8841034
885
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
886
- return 1;
1035
+ kvm_x86_ops.set_cr4(vcpu, cr4);
8871036
8881037 if (((cr4 ^ old_cr4) & mmu_role_bits) ||
8891038 (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
8901039 kvm_mmu_reset_context(vcpu);
8911040
8921041 if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
893
- kvm_update_cpuid(vcpu);
1042
+ kvm_update_cpuid_runtime(vcpu);
8941043
8951044 return 0;
8961045 }
....@@ -911,21 +1060,21 @@
9111060 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
9121061 if (!skip_tlb_flush) {
9131062 kvm_mmu_sync_roots(vcpu);
914
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1063
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9151064 }
9161065 return 0;
9171066 }
9181067
9191068 if (is_long_mode(vcpu) &&
920
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
1069
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
9211070 return 1;
9221071 else if (is_pae_paging(vcpu) &&
9231072 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
9241073 return 1;
9251074
926
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
1075
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
9271076 vcpu->arch.cr3 = cr3;
928
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
1077
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
9291078
9301079 return 0;
9311080 }
....@@ -963,13 +1112,7 @@
9631112 }
9641113 }
9651114
966
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
967
-{
968
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
969
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
970
-}
971
-
972
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
1115
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
9731116 {
9741117 unsigned long dr7;
9751118
....@@ -977,11 +1120,12 @@
9771120 dr7 = vcpu->arch.guest_debug_dr7;
9781121 else
9791122 dr7 = vcpu->arch.dr7;
980
- kvm_x86_ops->set_dr7(vcpu, dr7);
1123
+ kvm_x86_ops.set_dr7(vcpu, dr7);
9811124 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
9821125 if (dr7 & DR7_BP_EN_MASK)
9831126 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
9841127 }
1128
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
9851129
9861130 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
9871131 {
....@@ -1003,17 +1147,14 @@
10031147 vcpu->arch.eff_db[dr] = val;
10041148 break;
10051149 case 4:
1006
- /* fall through */
10071150 case 6:
1008
- if (val & 0xffffffff00000000ULL)
1151
+ if (!kvm_dr6_valid(val))
10091152 return -1; /* #GP */
10101153 vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1011
- kvm_update_dr6(vcpu);
10121154 break;
10131155 case 5:
1014
- /* fall through */
10151156 default: /* 7 */
1016
- if (val & 0xffffffff00000000ULL)
1157
+ if (!kvm_dr7_valid(val))
10171158 return -1; /* #GP */
10181159 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
10191160 kvm_update_dr7(vcpu);
....@@ -1042,15 +1183,10 @@
10421183 *val = vcpu->arch.db[array_index_nospec(dr, size)];
10431184 break;
10441185 case 4:
1045
- /* fall through */
10461186 case 6:
1047
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1048
- *val = vcpu->arch.dr6;
1049
- else
1050
- *val = kvm_x86_ops->get_dr6(vcpu);
1187
+ *val = vcpu->arch.dr6;
10511188 break;
10521189 case 5:
1053
- /* fall through */
10541190 default: /* 7 */
10551191 *val = vcpu->arch.dr7;
10561192 break;
....@@ -1061,15 +1197,15 @@
10611197
10621198 bool kvm_rdpmc(struct kvm_vcpu *vcpu)
10631199 {
1064
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
1200
+ u32 ecx = kvm_rcx_read(vcpu);
10651201 u64 data;
10661202 int err;
10671203
10681204 err = kvm_pmu_rdpmc(vcpu, ecx, &data);
10691205 if (err)
10701206 return err;
1071
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
1072
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
1207
+ kvm_rax_write(vcpu, (u32)data);
1208
+ kvm_rdx_write(vcpu, data >> 32);
10731209 return err;
10741210 }
10751211 EXPORT_SYMBOL_GPL(kvm_rdpmc);
....@@ -1078,26 +1214,66 @@
10781214 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
10791215 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
10801216 *
1081
- * This list is modified at module load time to reflect the
1217
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
1218
+ * extract the supported MSRs from the related const lists.
1219
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
10821220 * capabilities of the host cpu. This capabilities test skips MSRs that are
1083
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
1221
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
10841222 * may depend on host virtualization features rather than host cpu features.
10851223 */
10861224
1087
-static u32 msrs_to_save[] = {
1225
+static const u32 msrs_to_save_all[] = {
10881226 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
10891227 MSR_STAR,
10901228 #ifdef CONFIG_X86_64
10911229 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
10921230 #endif
10931231 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1094
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1095
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
1232
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1233
+ MSR_IA32_SPEC_CTRL,
1234
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1235
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1236
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1237
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1238
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1239
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1240
+ MSR_IA32_UMWAIT_CONTROL,
1241
+
1242
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1243
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1244
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1245
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1246
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1247
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1248
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1249
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1250
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
1251
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
1252
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
1253
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
1254
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
1255
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1256
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1257
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1258
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1259
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
1260
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
1261
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
1262
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
1263
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
1264
+
1265
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1266
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1267
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1268
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1269
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1270
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
10961271 };
10971272
1273
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
10981274 static unsigned num_msrs_to_save;
10991275
1100
-static u32 emulated_msrs[] = {
1276
+static const u32 emulated_msrs_all[] = {
11011277 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
11021278 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
11031279 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
....@@ -1113,12 +1289,18 @@
11131289 HV_X64_MSR_VP_ASSIST_PAGE,
11141290 HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
11151291 HV_X64_MSR_TSC_EMULATION_STATUS,
1292
+ HV_X64_MSR_SYNDBG_OPTIONS,
1293
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1294
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1295
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
11161296
11171297 MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1118
- MSR_KVM_PV_EOI_EN,
1298
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
11191299
11201300 MSR_IA32_TSC_ADJUST,
11211301 MSR_IA32_TSCDEADLINE,
1302
+ MSR_IA32_ARCH_CAPABILITIES,
1303
+ MSR_IA32_PERF_CAPABILITIES,
11221304 MSR_IA32_MISC_ENABLE,
11231305 MSR_IA32_MCG_STATUS,
11241306 MSR_IA32_MCG_CTL,
....@@ -1128,15 +1310,41 @@
11281310 MSR_PLATFORM_INFO,
11291311 MSR_MISC_FEATURES_ENABLES,
11301312 MSR_AMD64_VIRT_SPEC_CTRL,
1313
+ MSR_IA32_POWER_CTL,
1314
+ MSR_IA32_UCODE_REV,
1315
+
1316
+ /*
1317
+ * The following list leaves out MSRs whose values are determined
1318
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1319
+ * We always support the "true" VMX control MSRs, even if the host
1320
+ * processor does not, so I am putting these registers here rather
1321
+ * than in msrs_to_save_all.
1322
+ */
1323
+ MSR_IA32_VMX_BASIC,
1324
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1325
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1326
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
1327
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1328
+ MSR_IA32_VMX_MISC,
1329
+ MSR_IA32_VMX_CR0_FIXED0,
1330
+ MSR_IA32_VMX_CR4_FIXED0,
1331
+ MSR_IA32_VMX_VMCS_ENUM,
1332
+ MSR_IA32_VMX_PROCBASED_CTLS2,
1333
+ MSR_IA32_VMX_EPT_VPID_CAP,
1334
+ MSR_IA32_VMX_VMFUNC,
1335
+
1336
+ MSR_K7_HWCR,
1337
+ MSR_KVM_POLL_CONTROL,
11311338 };
11321339
1340
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
11331341 static unsigned num_emulated_msrs;
11341342
11351343 /*
11361344 * List of msr numbers which are used to expose MSR-based features that
11371345 * can be used by a hypervisor to validate requested CPU features.
11381346 */
1139
-static u32 msr_based_features[] = {
1347
+static const u32 msr_based_features_all[] = {
11401348 MSR_IA32_VMX_BASIC,
11411349 MSR_IA32_VMX_TRUE_PINBASED_CTLS,
11421350 MSR_IA32_VMX_PINBASED_CTLS,
....@@ -1156,18 +1364,41 @@
11561364 MSR_IA32_VMX_EPT_VPID_CAP,
11571365 MSR_IA32_VMX_VMFUNC,
11581366
1159
- MSR_F10H_DECFG,
1367
+ MSR_AMD64_DE_CFG,
11601368 MSR_IA32_UCODE_REV,
11611369 MSR_IA32_ARCH_CAPABILITIES,
1370
+ MSR_IA32_PERF_CAPABILITIES,
11621371 };
11631372
1373
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
11641374 static unsigned int num_msr_based_features;
11651375
1166
-u64 kvm_get_arch_capabilities(void)
1167
-{
1168
- u64 data;
1376
+/*
1377
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1378
+ * does not yet virtualize. These include:
1379
+ * 10 - MISC_PACKAGE_CTRLS
1380
+ * 11 - ENERGY_FILTERING_CTL
1381
+ * 12 - DOITM
1382
+ * 18 - FB_CLEAR_CTRL
1383
+ * 21 - XAPIC_DISABLE_STATUS
1384
+ * 23 - OVERCLOCKING_STATUS
1385
+ */
11691386
1170
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
1387
+#define KVM_SUPPORTED_ARCH_CAP \
1388
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1389
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1390
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1391
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1392
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
1393
+
1394
+static u64 kvm_get_arch_capabilities(void)
1395
+{
1396
+ u64 data = 0;
1397
+
1398
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1399
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1400
+ data &= KVM_SUPPORTED_ARCH_CAP;
1401
+ }
11711402
11721403 /*
11731404 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
....@@ -1196,34 +1427,30 @@
11961427 if (!boot_cpu_has_bug(X86_BUG_MDS))
11971428 data |= ARCH_CAP_MDS_NO;
11981429
1199
- /*
1200
- * On TAA affected systems, export MDS_NO=0 when:
1201
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
1202
- * - Updated microcode is present. This is detected by
1203
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
1204
- * that VERW clears CPU buffers.
1205
- *
1206
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
1207
- * mitigation and don't complain:
1208
- *
1209
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
1210
- *
1211
- * If TSX is disabled on the system, guests are also mitigated against
1212
- * TAA and clear CPU buffer mitigation is not required for guests.
1213
- */
1214
- if (!boot_cpu_has(X86_FEATURE_RTM))
1430
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
1431
+ /*
1432
+ * If RTM=0 because the kernel has disabled TSX, the host might
1433
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
1434
+ * and therefore knows that there cannot be TAA) but keep
1435
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1436
+ * and we want to allow migrating those guests to tsx=off hosts.
1437
+ */
12151438 data &= ~ARCH_CAP_TAA_NO;
1216
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
1439
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
12171440 data |= ARCH_CAP_TAA_NO;
1218
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
1219
- data &= ~ARCH_CAP_MDS_NO;
1441
+ } else {
1442
+ /*
1443
+ * Nothing to do here; we emulate TSX_CTRL if present on the
1444
+ * host so the guest can choose between disabling TSX or
1445
+ * using VERW to clear CPU buffers.
1446
+ */
1447
+ }
12201448
1221
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
1222
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
1449
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
1450
+ data |= ARCH_CAP_GDS_NO;
1451
+
12231452 return data;
12241453 }
1225
-
1226
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
12271454
12281455 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
12291456 {
....@@ -1235,8 +1462,7 @@
12351462 rdmsrl_safe(msr->index, &msr->data);
12361463 break;
12371464 default:
1238
- if (kvm_x86_ops->get_msr_feature(msr))
1239
- return 1;
1465
+ return kvm_x86_ops.get_msr_feature(msr);
12401466 }
12411467 return 0;
12421468 }
....@@ -1248,6 +1474,14 @@
12481474
12491475 msr.index = index;
12501476 r = kvm_get_msr_feature(&msr);
1477
+
1478
+ if (r == KVM_MSR_RET_INVALID) {
1479
+ /* Unconditionally clear the output for simplicity */
1480
+ *data = 0;
1481
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1482
+ r = 0;
1483
+ }
1484
+
12511485 if (r)
12521486 return r;
12531487
....@@ -1262,6 +1496,13 @@
12621496 return false;
12631497
12641498 if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1499
+ return false;
1500
+
1501
+ if (efer & (EFER_LME | EFER_LMA) &&
1502
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1503
+ return false;
1504
+
1505
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
12651506 return false;
12661507
12671508 return true;
....@@ -1280,6 +1521,7 @@
12801521 {
12811522 u64 old_efer = vcpu->arch.efer;
12821523 u64 efer = msr_info->data;
1524
+ int r;
12831525
12841526 if (efer & efer_reserved_bits)
12851527 return 1;
....@@ -1296,7 +1538,11 @@
12961538 efer &= ~EFER_LMA;
12971539 efer |= vcpu->arch.efer & EFER_LMA;
12981540
1299
- kvm_x86_ops->set_efer(vcpu, efer);
1541
+ r = kvm_x86_ops.set_efer(vcpu, efer);
1542
+ if (r) {
1543
+ WARN_ON(r > 0);
1544
+ return r;
1545
+ }
13001546
13011547 /* Update reserved bits */
13021548 if ((efer ^ old_efer) & EFER_NX)
....@@ -1311,20 +1557,73 @@
13111557 }
13121558 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
13131559
1560
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1561
+{
1562
+ struct kvm_x86_msr_filter *msr_filter;
1563
+ struct msr_bitmap_range *ranges;
1564
+ struct kvm *kvm = vcpu->kvm;
1565
+ bool allowed;
1566
+ int idx;
1567
+ u32 i;
1568
+
1569
+ /* x2APIC MSRs do not support filtering. */
1570
+ if (index >= 0x800 && index <= 0x8ff)
1571
+ return true;
1572
+
1573
+ idx = srcu_read_lock(&kvm->srcu);
1574
+
1575
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1576
+ if (!msr_filter) {
1577
+ allowed = true;
1578
+ goto out;
1579
+ }
1580
+
1581
+ allowed = msr_filter->default_allow;
1582
+ ranges = msr_filter->ranges;
1583
+
1584
+ for (i = 0; i < msr_filter->count; i++) {
1585
+ u32 start = ranges[i].base;
1586
+ u32 end = start + ranges[i].nmsrs;
1587
+ u32 flags = ranges[i].flags;
1588
+ unsigned long *bitmap = ranges[i].bitmap;
1589
+
1590
+ if ((index >= start) && (index < end) && (flags & type)) {
1591
+ allowed = !!test_bit(index - start, bitmap);
1592
+ break;
1593
+ }
1594
+
1595
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
1596
+ ++vcpu->stat.exits;
1597
+ }
1598
+
1599
+out:
1600
+ srcu_read_unlock(&kvm->srcu, idx);
1601
+
1602
+ return allowed;
1603
+}
1604
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1605
+
13141606 /*
1315
- * Writes msr value into into the appropriate "register".
1607
+ * Write @data into the MSR specified by @index. Select MSR specific fault
1608
+ * checks are bypassed if @host_initiated is %true.
13161609 * Returns 0 on success, non-0 otherwise.
13171610 * Assumes vcpu_load() was already called.
13181611 */
1319
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1612
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1613
+ bool host_initiated)
13201614 {
1321
- switch (msr->index) {
1615
+ struct msr_data msr;
1616
+
1617
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1618
+ return KVM_MSR_RET_FILTERED;
1619
+
1620
+ switch (index) {
13221621 case MSR_FS_BASE:
13231622 case MSR_GS_BASE:
13241623 case MSR_KERNEL_GS_BASE:
13251624 case MSR_CSTAR:
13261625 case MSR_LSTAR:
1327
- if (is_noncanonical_address(msr->data, vcpu))
1626
+ if (is_noncanonical_address(data, vcpu))
13281627 return 1;
13291628 break;
13301629 case MSR_IA32_SYSENTER_EIP:
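Side note, not part of the commit: kvm_msr_allowed(), added earlier in this hunk, resolves an MSR index against the installed filter by scanning ranges and testing one bit per MSR. A compact standalone sketch of that lookup follows; the types are hypothetical stand-ins for kvm_x86_msr_filter and msr_bitmap_range, not the kernel structures themselves.

/*
 * Illustrative sketch only -- not from the commit. An MSR index is allowed
 * either by the bit covering it in the first matching range, or by the
 * filter's default policy when no range matches.
 */
#include <stdbool.h>
#include <stdint.h>

struct msr_range {			/* stands in for struct msr_bitmap_range */
	uint32_t base;			/* first MSR index covered */
	uint32_t nmsrs;			/* number of MSRs covered */
	uint32_t flags;			/* read/write bits this range filters */
	const uint8_t *bitmap;		/* one bit per MSR, 1 = allowed */
};

struct msr_filter {			/* stands in for struct kvm_x86_msr_filter */
	bool default_allow;
	uint32_t count;
	struct msr_range ranges[16];
};

static bool msr_allowed(const struct msr_filter *filter, uint32_t index,
			uint32_t type)
{
	uint32_t i;

	if (!filter)			/* no filter installed: allow everything */
		return true;

	for (i = 0; i < filter->count; i++) {
		const struct msr_range *r = &filter->ranges[i];

		if (index >= r->base && index < r->base + r->nmsrs &&
		    (r->flags & type)) {
			uint32_t bit = index - r->base;

			return (r->bitmap[bit / 8] >> (bit % 8)) & 1;
		}
	}

	return filter->default_allow;	/* no range matched */
}

Falling back to default_allow when no range matches mirrors the msr_filter->default_allow path in the real function.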
....@@ -1341,54 +1640,313 @@
13411640 * value, and that something deterministic happens if the guest
13421641 * invokes 64-bit SYSENTER.
13431642 */
1344
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
1643
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
13451644 }
1346
- return kvm_x86_ops->set_msr(vcpu, msr);
1645
+
1646
+ msr.data = data;
1647
+ msr.index = index;
1648
+ msr.host_initiated = host_initiated;
1649
+
1650
+ return kvm_x86_ops.set_msr(vcpu, &msr);
1651
+}
1652
+
1653
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1654
+ u32 index, u64 data, bool host_initiated)
1655
+{
1656
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1657
+
1658
+ if (ret == KVM_MSR_RET_INVALID)
1659
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
1660
+ ret = 0;
1661
+
1662
+ return ret;
1663
+}
1664
+
1665
+/*
1666
+ * Read the MSR specified by @index into @data. Select MSR specific fault
1667
+ * checks are bypassed if @host_initiated is %true.
1668
+ * Returns 0 on success, non-0 otherwise.
1669
+ * Assumes vcpu_load() was already called.
1670
+ */
1671
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1672
+ bool host_initiated)
1673
+{
1674
+ struct msr_data msr;
1675
+ int ret;
1676
+
1677
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1678
+ return KVM_MSR_RET_FILTERED;
1679
+
1680
+ msr.index = index;
1681
+ msr.host_initiated = host_initiated;
1682
+
1683
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
1684
+ if (!ret)
1685
+ *data = msr.data;
1686
+ return ret;
1687
+}
1688
+
1689
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1690
+ u32 index, u64 *data, bool host_initiated)
1691
+{
1692
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1693
+
1694
+ if (ret == KVM_MSR_RET_INVALID) {
1695
+ /* Unconditionally clear *data for simplicity */
1696
+ *data = 0;
1697
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
1698
+ ret = 0;
1699
+ }
1700
+
1701
+ return ret;
1702
+}
1703
+
1704
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1705
+{
1706
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
1707
+}
1708
+EXPORT_SYMBOL_GPL(kvm_get_msr);
1709
+
1710
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1711
+{
1712
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
13471713 }
13481714 EXPORT_SYMBOL_GPL(kvm_set_msr);
1715
+
1716
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
1717
+{
1718
+ if (vcpu->run->msr.error) {
1719
+ kvm_inject_gp(vcpu, 0);
1720
+ return 1;
1721
+ } else if (is_read) {
1722
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1723
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1724
+ }
1725
+
1726
+ return kvm_skip_emulated_instruction(vcpu);
1727
+}
1728
+
1729
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1730
+{
1731
+ return complete_emulated_msr(vcpu, true);
1732
+}
1733
+
1734
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
1735
+{
1736
+ return complete_emulated_msr(vcpu, false);
1737
+}
1738
+
1739
+static u64 kvm_msr_reason(int r)
1740
+{
1741
+ switch (r) {
1742
+ case KVM_MSR_RET_INVALID:
1743
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
1744
+ case KVM_MSR_RET_FILTERED:
1745
+ return KVM_MSR_EXIT_REASON_FILTER;
1746
+ default:
1747
+ return KVM_MSR_EXIT_REASON_INVAL;
1748
+ }
1749
+}
1750
+
1751
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1752
+ u32 exit_reason, u64 data,
1753
+ int (*completion)(struct kvm_vcpu *vcpu),
1754
+ int r)
1755
+{
1756
+ u64 msr_reason = kvm_msr_reason(r);
1757
+
1758
+ /* Check if the user wanted to know about this MSR fault */
1759
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
1760
+ return 0;
1761
+
1762
+ vcpu->run->exit_reason = exit_reason;
1763
+ vcpu->run->msr.error = 0;
1764
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
1765
+ vcpu->run->msr.reason = msr_reason;
1766
+ vcpu->run->msr.index = index;
1767
+ vcpu->run->msr.data = data;
1768
+ vcpu->arch.complete_userspace_io = completion;
1769
+
1770
+ return 1;
1771
+}
1772
+
1773
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
1774
+{
1775
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
1776
+ complete_emulated_rdmsr, r);
1777
+}
1778
+
1779
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
1780
+{
1781
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
1782
+ complete_emulated_wrmsr, r);
1783
+}
1784
+
1785
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
1786
+{
1787
+ u32 ecx = kvm_rcx_read(vcpu);
1788
+ u64 data;
1789
+ int r;
1790
+
1791
+ r = kvm_get_msr(vcpu, ecx, &data);
1792
+
1793
+ /* MSR read failed? See if we should ask user space */
1794
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
1795
+ /* Bounce to user space */
1796
+ return 0;
1797
+ }
1798
+
1799
+ /* MSR read failed? Inject a #GP */
1800
+ if (r) {
1801
+ trace_kvm_msr_read_ex(ecx);
1802
+ kvm_inject_gp(vcpu, 0);
1803
+ return 1;
1804
+ }
1805
+
1806
+ trace_kvm_msr_read(ecx, data);
1807
+
1808
+ kvm_rax_write(vcpu, data & -1u);
1809
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
1810
+ return kvm_skip_emulated_instruction(vcpu);
1811
+}
1812
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
1813
+
1814
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
1815
+{
1816
+ u32 ecx = kvm_rcx_read(vcpu);
1817
+ u64 data = kvm_read_edx_eax(vcpu);
1818
+ int r;
1819
+
1820
+ r = kvm_set_msr(vcpu, ecx, data);
1821
+
1822
+ /* MSR write failed? See if we should ask user space */
1823
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
1824
+ /* Bounce to user space */
1825
+ return 0;
1826
+
1827
+ /* Signal all other negative errors to userspace */
1828
+ if (r < 0)
1829
+ return r;
1830
+
1831
+ /* MSR write failed? Inject a #GP */
1832
+ if (r > 0) {
1833
+ trace_kvm_msr_write_ex(ecx, data);
1834
+ kvm_inject_gp(vcpu, 0);
1835
+ return 1;
1836
+ }
1837
+
1838
+ trace_kvm_msr_write(ecx, data);
1839
+ return kvm_skip_emulated_instruction(vcpu);
1840
+}
1841
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
1842
+
1843
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
1844
+{
1845
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
1846
+ xfer_to_guest_mode_work_pending();
1847
+}
1848
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
1849
+
1850
+/*
1851
+ * The fast path for frequent and performance sensitive wrmsr emulation,
1852
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
1853
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
1854
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
1855
+ * other cases which must be called after interrupts are enabled on the host.
1856
+ */
1857
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
1858
+{
1859
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
1860
+ return 1;
1861
+
1862
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
1863
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
1864
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
1865
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
1866
+
1867
+ data &= ~(1 << 12);
1868
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
1869
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
1870
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
1871
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
1872
+ return 0;
1873
+ }
1874
+
1875
+ return 1;
1876
+}
1877
+
1878
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
1879
+{
1880
+ if (!kvm_can_use_hv_timer(vcpu))
1881
+ return 1;
1882
+
1883
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
1884
+ return 0;
1885
+}
1886
+
1887
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
1888
+{
1889
+ u32 msr = kvm_rcx_read(vcpu);
1890
+ u64 data;
1891
+ fastpath_t ret = EXIT_FASTPATH_NONE;
1892
+
1893
+ switch (msr) {
1894
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
1895
+ data = kvm_read_edx_eax(vcpu);
1896
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
1897
+ kvm_skip_emulated_instruction(vcpu);
1898
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
1899
+ }
1900
+ break;
1901
+ case MSR_IA32_TSCDEADLINE:
1902
+ data = kvm_read_edx_eax(vcpu);
1903
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
1904
+ kvm_skip_emulated_instruction(vcpu);
1905
+ ret = EXIT_FASTPATH_REENTER_GUEST;
1906
+ }
1907
+ break;
1908
+ default:
1909
+ break;
1910
+ }
1911
+
1912
+ if (ret != EXIT_FASTPATH_NONE)
1913
+ trace_kvm_msr_write(msr, data);
1914
+
1915
+ return ret;
1916
+}
1917
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
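/*
 * A stand-alone sketch of the gate used by the x2APIC ICR fastpath above:
 * only fixed, physical-destination, no-shorthand, non-broadcast IPIs are
 * handled with interrupts still disabled, using the 64-bit EDX:EAX WRMSR
 * payload.  The mask values mirror the APIC_* definitions referenced in
 * the function and should be treated as illustrative, not authoritative.
 */
#include <stdint.h>
#include <stdbool.h>

#define ICR_SHORT_MASK		0xc0000ULL	/* destination shorthand          */
#define ICR_DEST_MODE_MASK	0x00800ULL	/* 0 = physical destination       */
#define ICR_DELIVERY_MASK	0x00700ULL	/* 0 = fixed delivery mode        */
#define ICR_BUSY_BIT		(1ULL << 12)	/* delivery status, cleared above */
#define X2APIC_BCAST_ID		0xffffffffu

static bool icr_is_fastpath_ipi(uint32_t eax, uint32_t edx)
{
	uint64_t icr = ((uint64_t)edx << 32) | eax;	/* WRMSR payload */

	return (icr & ICR_SHORT_MASK) == 0 &&
	       (icr & ICR_DEST_MODE_MASK) == 0 &&
	       (icr & ICR_DELIVERY_MASK) == 0 &&
	       (uint32_t)(icr >> 32) != X2APIC_BCAST_ID;
}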
13491918
13501919 /*
13511920 * Adapt set_msr() to msr_io()'s calling convention
13521921 */
13531922 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13541923 {
1355
- struct msr_data msr;
1356
- int r;
1357
-
1358
- msr.index = index;
1359
- msr.host_initiated = true;
1360
- r = kvm_get_msr(vcpu, &msr);
1361
- if (r)
1362
- return r;
1363
-
1364
- *data = msr.data;
1365
- return 0;
1924
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
13661925 }
13671926
13681927 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
13691928 {
1370
- struct msr_data msr;
1371
-
1372
- msr.data = *data;
1373
- msr.index = index;
1374
- msr.host_initiated = true;
1375
- return kvm_set_msr(vcpu, &msr);
1929
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
13761930 }
13771931
13781932 #ifdef CONFIG_X86_64
1933
+struct pvclock_clock {
1934
+ int vclock_mode;
1935
+ u64 cycle_last;
1936
+ u64 mask;
1937
+ u32 mult;
1938
+ u32 shift;
1939
+ u64 base_cycles;
1940
+ u64 offset;
1941
+};
1942
+
13791943 struct pvclock_gtod_data {
13801944 seqcount_t seq;
13811945
1382
- struct { /* extract of a clocksource struct */
1383
- int vclock_mode;
1384
- u64 cycle_last;
1385
- u64 mask;
1386
- u32 mult;
1387
- u32 shift;
1388
- } clock;
1946
+ struct pvclock_clock clock; /* extract of a clocksource struct */
1947
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
13891948
1390
- u64 boot_ns;
1391
- u64 nsec_base;
1949
+ ktime_t offs_boot;
13921950 u64 wall_time_sec;
13931951 };
13941952
....@@ -1397,44 +1955,54 @@
13971955 static void update_pvclock_gtod(struct timekeeper *tk)
13981956 {
13991957 struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1400
- u64 boot_ns;
1401
-
1402
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
14031958
14041959 write_seqcount_begin(&vdata->seq);
14051960
14061961 /* copy pvclock gtod data */
1407
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
1962
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
14081963 vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
14091964 vdata->clock.mask = tk->tkr_mono.mask;
14101965 vdata->clock.mult = tk->tkr_mono.mult;
14111966 vdata->clock.shift = tk->tkr_mono.shift;
1967
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
1968
+ vdata->clock.offset = tk->tkr_mono.base;
14121969
1413
- vdata->boot_ns = boot_ns;
1414
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
1970
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
1971
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
1972
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
1973
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
1974
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
1975
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
1976
+ vdata->raw_clock.offset = tk->tkr_raw.base;
14151977
14161978 vdata->wall_time_sec = tk->xtime_sec;
14171979
1980
+ vdata->offs_boot = tk->offs_boot;
1981
+
14181982 write_seqcount_end(&vdata->seq);
14191983 }
1420
-#endif
14211984
1422
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
1985
+static s64 get_kvmclock_base_ns(void)
14231986 {
1424
- /*
1425
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
1426
- * vcpu_enter_guest. This function is only called from
1427
- * the physical CPU that is running vcpu.
1428
- */
1429
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
1987
+ /* Count up from boot time, but with the frequency of the raw clock. */
1988
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
14301989 }
1990
+#else
1991
+static s64 get_kvmclock_base_ns(void)
1992
+{
1993
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
1994
+ return ktime_get_boottime_ns();
1995
+}
1996
+#endif
14311997
14321998 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
14331999 {
14342000 int version;
14352001 int r;
14362002 struct pvclock_wall_clock wc;
1437
- struct timespec64 boot;
2003
+ u64 wall_nsec;
2004
+
2005
+ kvm->arch.wall_clock = wall_clock;
14382006
14392007 if (!wall_clock)
14402008 return;
....@@ -1454,23 +2022,46 @@
14542022 /*
14552023 * The guest calculates current wall clock time by adding
14562024 * system time (updated by kvm_guest_time_update below) to the
1457
- * wall clock specified here. guest system time equals host
1458
- * system time for us, thus we must fill in host boot time here.
2025
+ * wall clock specified here. We do the reverse here.
14592026 */
1460
- getboottime64(&boot);
2027
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
14612028
1462
- if (kvm->arch.kvmclock_offset) {
1463
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
1464
- boot = timespec64_sub(boot, ts);
1465
- }
1466
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
1467
- wc.nsec = boot.tv_nsec;
2029
+ wc.nsec = do_div(wall_nsec, 1000000000);
2030
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
14682031 wc.version = version;
14692032
14702033 kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
14712034
14722035 version++;
14732036 kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2037
+}
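/*
 * Minimal arithmetic sketch of the "reverse" computation above: the guest
 * derives wall time as wall_clock + kvmclock, so KVM publishes current
 * realtime minus current kvmclock, split into seconds and nanoseconds
 * (plain C division standing in for the kernel's do_div()).
 */
#include <stdint.h>

struct wall_clock_pair {
	uint32_t sec;	/* wraps in 2106, as noted above */
	uint32_t nsec;
};

static struct wall_clock_pair split_wall_clock(uint64_t realtime_ns,
					       uint64_t kvmclock_ns)
{
	uint64_t wall_nsec = realtime_ns - kvmclock_ns;

	return (struct wall_clock_pair) {
		.sec  = (uint32_t)(wall_nsec / 1000000000ULL),
		.nsec = (uint32_t)(wall_nsec % 1000000000ULL),
	};
}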
2038
+
2039
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2040
+ bool old_msr, bool host_initiated)
2041
+{
2042
+ struct kvm_arch *ka = &vcpu->kvm->arch;
2043
+
2044
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
2045
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2046
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2047
+
2048
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
2049
+ }
2050
+
2051
+ vcpu->arch.time = system_time;
2052
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2053
+
2054
+ /* we verify if the enable bit is set... */
2055
+ vcpu->arch.pv_time_enabled = false;
2056
+ if (!(system_time & 1))
2057
+ return;
2058
+
2059
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
2060
+ &vcpu->arch.pv_time, system_time & ~1ULL,
2061
+ sizeof(struct pvclock_vcpu_time_info)))
2062
+ vcpu->arch.pv_time_enabled = true;
2063
+
2064
+ return;
14742065 }
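/*
 * Guest-side sketch of the MSR value parsed by kvm_write_system_time()
 * above: bit 0 enables the per-vCPU pvclock page and the remaining bits
 * carry its guest physical address.  The MSR index is assumed from the
 * kvmclock uapi; guest-internal wrmsr() plumbing is not shown.
 */
#include <stdint.h>

#define MSR_KVM_SYSTEM_TIME_NEW	0x4b564d01

static uint64_t kvmclock_system_time_val(uint64_t time_info_gpa, int enable)
{
	/* KVM strips the enable bit with "system_time & ~1ULL" above. */
	return (time_info_gpa & ~1ULL) | (enable ? 1ULL : 0);
}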
14752066
14762067 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
....@@ -1505,9 +2096,6 @@
15052096
15062097 *pshift = shift;
15072098 *pmultiplier = div_frac(scaled64, tps32);
1508
-
1509
- pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
1510
- __func__, base_hz, scaled_hz, shift, *pmultiplier);
15112099 }
15122100
15132101 #ifdef CONFIG_X86_64
....@@ -1604,7 +2192,7 @@
16042192
16052193 static inline int gtod_is_based_on_tsc(int mode)
16062194 {
1607
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
2195
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
16082196 }
16092197
16102198 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
....@@ -1633,12 +2221,6 @@
16332221 atomic_read(&vcpu->kvm->online_vcpus),
16342222 ka->use_master_clock, gtod->clock.vclock_mode);
16352223 #endif
1636
-}
1637
-
1638
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1639
-{
1640
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1641
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
16422224 }
16432225
16442226 /*
....@@ -1679,15 +2261,14 @@
16792261
16802262 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
16812263 {
1682
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
1683
-
1684
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
2264
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
16852265 }
16862266 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
16872267
16882268 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
16892269 {
1690
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
2270
+ vcpu->arch.l1_tsc_offset = offset;
2271
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
16912272 }
16922273
16932274 static inline bool kvm_check_tsc_unstable(void)
....@@ -1697,29 +2278,28 @@
16972278 * TSC is marked unstable when we're running on Hyper-V,
16982279 * 'TSC page' clocksource is good.
16992280 */
1700
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
2281
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
17012282 return false;
17022283 #endif
17032284 return check_tsc_unstable();
17042285 }
17052286
1706
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
2287
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
17072288 {
17082289 struct kvm *kvm = vcpu->kvm;
17092290 u64 offset, ns, elapsed;
17102291 unsigned long flags;
17112292 bool matched;
17122293 bool already_matched;
1713
- u64 data = msr->data;
17142294 bool synchronizing = false;
17152295
17162296 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
17172297 offset = kvm_compute_tsc_offset(vcpu, data);
1718
- ns = ktime_get_boot_ns();
2298
+ ns = get_kvmclock_base_ns();
17192299 elapsed = ns - kvm->arch.last_tsc_nsec;
17202300
17212301 if (vcpu->arch.virtual_tsc_khz) {
1722
- if (data == 0 && msr->host_initiated) {
2302
+ if (data == 0) {
17232303 /*
17242304 * detection of vcpu initialization -- need to sync
17252305 * with other vCPUs. This particularly helps to keep
....@@ -1750,12 +2330,10 @@
17502330 vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
17512331 if (!kvm_check_tsc_unstable()) {
17522332 offset = kvm->arch.cur_tsc_offset;
1753
- pr_debug("kvm: matched tsc offset for %llu\n", data);
17542333 } else {
17552334 u64 delta = nsec_to_cycles(vcpu, elapsed);
17562335 data += delta;
17572336 offset = kvm_compute_tsc_offset(vcpu, data);
1758
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
17592337 }
17602338 matched = true;
17612339 already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
....@@ -1774,8 +2352,6 @@
17742352 kvm->arch.cur_tsc_write = data;
17752353 kvm->arch.cur_tsc_offset = offset;
17762354 matched = false;
1777
- pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1778
- kvm->arch.cur_tsc_generation, data);
17792355 }
17802356
17812357 /*
....@@ -1793,9 +2369,6 @@
17932369 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
17942370 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
17952371
1796
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
1797
- update_ia32_tsc_adjust_msr(vcpu, offset);
1798
-
17992372 kvm_vcpu_write_tsc_offset(vcpu, offset);
18002373 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
18012374
....@@ -1810,12 +2383,10 @@
18102383 spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
18112384 }
18122385
1813
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
1814
-
18152386 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
18162387 s64 adjustment)
18172388 {
1818
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
2389
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
18192390 kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
18202391 }
18212392
....@@ -1849,43 +2420,43 @@
18492420 return last;
18502421 }
18512422
1852
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
2423
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2424
+ int *mode)
18532425 {
18542426 long v;
1855
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18562427 u64 tsc_pg_val;
18572428
1858
- switch (gtod->clock.vclock_mode) {
1859
- case VCLOCK_HVCLOCK:
2429
+ switch (clock->vclock_mode) {
2430
+ case VDSO_CLOCKMODE_HVCLOCK:
18602431 tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
18612432 tsc_timestamp);
18622433 if (tsc_pg_val != U64_MAX) {
18632434 /* TSC page valid */
1864
- *mode = VCLOCK_HVCLOCK;
1865
- v = (tsc_pg_val - gtod->clock.cycle_last) &
1866
- gtod->clock.mask;
2435
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
2436
+ v = (tsc_pg_val - clock->cycle_last) &
2437
+ clock->mask;
18672438 } else {
18682439 /* TSC page invalid */
1869
- *mode = VCLOCK_NONE;
2440
+ *mode = VDSO_CLOCKMODE_NONE;
18702441 }
18712442 break;
1872
- case VCLOCK_TSC:
1873
- *mode = VCLOCK_TSC;
2443
+ case VDSO_CLOCKMODE_TSC:
2444
+ *mode = VDSO_CLOCKMODE_TSC;
18742445 *tsc_timestamp = read_tsc();
1875
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
1876
- gtod->clock.mask;
2446
+ v = (*tsc_timestamp - clock->cycle_last) &
2447
+ clock->mask;
18772448 break;
18782449 default:
1879
- *mode = VCLOCK_NONE;
2450
+ *mode = VDSO_CLOCKMODE_NONE;
18802451 }
18812452
1882
- if (*mode == VCLOCK_NONE)
2453
+ if (*mode == VDSO_CLOCKMODE_NONE)
18832454 *tsc_timestamp = v = 0;
18842455
1885
- return v * gtod->clock.mult;
2456
+ return v * clock->mult;
18862457 }
18872458
1888
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
2459
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
18892460 {
18902461 struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
18912462 unsigned long seq;
....@@ -1894,10 +2465,10 @@
18942465
18952466 do {
18962467 seq = read_seqcount_begin(&gtod->seq);
1897
- ns = gtod->nsec_base;
1898
- ns += vgettsc(tsc_timestamp, &mode);
1899
- ns >>= gtod->clock.shift;
1900
- ns += gtod->boot_ns;
2468
+ ns = gtod->raw_clock.base_cycles;
2469
+ ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2470
+ ns >>= gtod->raw_clock.shift;
2471
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
19012472 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19022473 *t = ns;
19032474
....@@ -1914,8 +2485,8 @@
19142485 do {
19152486 seq = read_seqcount_begin(&gtod->seq);
19162487 ts->tv_sec = gtod->wall_time_sec;
1917
- ns = gtod->nsec_base;
1918
- ns += vgettsc(tsc_timestamp, &mode);
2488
+ ns = gtod->clock.base_cycles;
2489
+ ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
19192490 ns >>= gtod->clock.shift;
19202491 } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
19212492
....@@ -1932,7 +2503,7 @@
19322503 if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
19332504 return false;
19342505
1935
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
2506
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
19362507 tsc_timestamp));
19372508 }
19382509
....@@ -2057,7 +2628,7 @@
20572628 spin_lock(&ka->pvclock_gtod_sync_lock);
20582629 if (!ka->use_master_clock) {
20592630 spin_unlock(&ka->pvclock_gtod_sync_lock);
2060
- return ktime_get_boot_ns() + ka->kvmclock_offset;
2631
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
20612632 }
20622633
20632634 hv_clock.tsc_timestamp = ka->master_cycle_now;
....@@ -2073,7 +2644,7 @@
20732644 &hv_clock.tsc_to_system_mul);
20742645 ret = __pvclock_read_cycles(&hv_clock, rdtsc());
20752646 } else
2076
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
2647
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
20772648
20782649 put_cpu();
20792650
....@@ -2172,7 +2743,7 @@
21722743 }
21732744 if (!use_master_clock) {
21742745 host_tsc = rdtsc();
2175
- kernel_ns = ktime_get_boot_ns();
2746
+ kernel_ns = get_kvmclock_base_ns();
21762747 }
21772748
21782749 tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
....@@ -2284,6 +2855,18 @@
22842855 KVMCLOCK_SYNC_PERIOD);
22852856 }
22862857
2858
+/*
2859
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
2860
+ */
2861
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
2862
+{
2863
+ /* McStatusWrEn enabled? */
2864
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
2865
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
2866
+
2867
+ return false;
2868
+}
2869
+
22872870 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
22882871 {
22892872 u64 mcg_cap = vcpu->arch.mcg_cap;
....@@ -2313,14 +2896,22 @@
23132896 /* only 0 or all 1s can be written to IA32_MCi_CTL
23142897 * some Linux kernels though clear bit 10 in bank 4 to
23152898 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
2316
- * this to avoid an uncatched #GP in the guest
2899
+ * this to avoid an uncaught #GP in the guest.
2900
+ *
2901
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
2902
+ * correctable, single-bit ECC data errors.
23172903 */
23182904 if ((offset & 0x3) == 0 &&
2319
- data != 0 && (data | (1 << 10)) != ~(u64)0)
2320
- return -1;
2905
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
2906
+ return 1;
2907
+
2908
+ /* MCi_STATUS */
23212909 if (!msr_info->host_initiated &&
2322
- (offset & 0x3) == 1 && data != 0)
2323
- return -1;
2910
+ (offset & 0x3) == 1 && data != 0) {
2911
+ if (!can_set_mci_status(vcpu))
2912
+ return 1;
2913
+ }
2914
+
23242915 vcpu->arch.mce_banks[offset] = data;
23252916 break;
23262917 }
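/*
 * Worked example of the IA32_MCi_CTL filter in the hunk above: only 0 or
 * all-ones is architecturally valid, but a cleared bit 10 (Linux K8 GART
 * quirk) or a cleared bit 0 (UnixWare ECC quirk) is tolerated by OR-ing
 * those bits back in before the all-ones comparison.
 */
#include <stdint.h>
#include <stdbool.h>

static bool mci_ctl_value_ok(uint64_t data)
{
	if (data == 0)
		return true;
	return (data | (1ULL << 10) | 1ULL) == ~(uint64_t)0;
}
/*
 * e.g. 0xfffffffffffffbfe (bits 0 and 10 clear) is accepted, while
 * 0x00000000000000ff fails the check and the write raises #GP in the guest.
 */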
....@@ -2340,104 +2931,192 @@
23402931 u32 page_num = data & ~PAGE_MASK;
23412932 u64 page_addr = data & PAGE_MASK;
23422933 u8 *page;
2343
- int r;
23442934
2345
- r = -E2BIG;
23462935 if (page_num >= blob_size)
2347
- goto out;
2348
- r = -ENOMEM;
2936
+ return 1;
2937
+
23492938 page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
2350
- if (IS_ERR(page)) {
2351
- r = PTR_ERR(page);
2352
- goto out;
2939
+ if (IS_ERR(page))
2940
+ return PTR_ERR(page);
2941
+
2942
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
2943
+ kfree(page);
2944
+ return 1;
23532945 }
2354
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
2355
- goto out_free;
2356
- r = 0;
2357
-out_free:
2358
- kfree(page);
2359
-out:
2360
- return r;
2946
+ return 0;
2947
+}
2948
+
2949
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
2950
+{
2951
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
2952
+
2953
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
23612954 }
23622955
23632956 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
23642957 {
23652958 gpa_t gpa = data & ~0x3f;
23662959
2367
- /* Bits 3:5 are reserved, Should be zero */
2368
- if (data & 0x38)
2960
+ /* Bits 4:5 are reserved and should be zero */
2961
+ if (data & 0x30)
23692962 return 1;
23702963
2371
- vcpu->arch.apf.msr_val = data;
2964
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
2965
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
2966
+ return 1;
23722967
2373
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
2968
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
2969
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
2970
+ return 1;
2971
+
2972
+ if (!lapic_in_kernel(vcpu))
2973
+ return data ? 1 : 0;
2974
+
2975
+ vcpu->arch.apf.msr_en_val = data;
2976
+
2977
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
23742978 kvm_clear_async_pf_completion_queue(vcpu);
23752979 kvm_async_pf_hash_reset(vcpu);
23762980 return 0;
23772981 }
23782982
23792983 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2380
- sizeof(u32)))
2984
+ sizeof(u64)))
23812985 return 1;
23822986
23832987 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
23842988 vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
2989
+
23852990 kvm_async_pf_wakeup_all(vcpu);
2991
+
2992
+ return 0;
2993
+}
2994
+
2995
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
2996
+{
2997
+ /* Bits 8-63 are reserved */
2998
+ if (data >> 8)
2999
+ return 1;
3000
+
3001
+ if (!lapic_in_kernel(vcpu))
3002
+ return 1;
3003
+
3004
+ vcpu->arch.apf.msr_int_val = data;
3005
+
3006
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3007
+
23863008 return 0;
23873009 }
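/*
 * Guest-side sketch of the two async-PF MSR values validated above.
 * ASYNC_PF_EN carries the 64-byte-aligned GPA of the APF area plus the
 * low control bits (enable, send-always, delivery-as-interrupt);
 * ASYNC_PF_INT carries only the interrupt vector in bits 7:0.  The bit
 * positions mirror the uapi flags referenced in the code and are shown
 * here for illustration.
 */
#include <stdint.h>

#define APF_ENABLED		(1ULL << 0)	/* KVM_ASYNC_PF_ENABLED         */
#define APF_SEND_ALWAYS		(1ULL << 1)	/* KVM_ASYNC_PF_SEND_ALWAYS     */
#define APF_DELIVERY_AS_INT	(1ULL << 3)	/* KVM_ASYNC_PF_DELIVERY_AS_INT */

static uint64_t async_pf_en_val(uint64_t apf_area_gpa)
{
	/* Bits 4:5 must stay zero or KVM rejects the write. */
	return (apf_area_gpa & ~0x3fULL) | APF_ENABLED | APF_DELIVERY_AS_INT;
}

static uint64_t async_pf_int_val(uint8_t vector)
{
	return vector;	/* bits 8-63 are reserved and must be zero */
}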
23883010
23893011 static void kvmclock_reset(struct kvm_vcpu *vcpu)
23903012 {
23913013 vcpu->arch.pv_time_enabled = false;
3014
+ vcpu->arch.time = 0;
23923015 }
23933016
2394
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
3017
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
23953018 {
23963019 ++vcpu->stat.tlb_flush;
2397
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
3020
+ kvm_x86_ops.tlb_flush_all(vcpu);
3021
+}
3022
+
3023
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3024
+{
3025
+ ++vcpu->stat.tlb_flush;
3026
+ kvm_x86_ops.tlb_flush_guest(vcpu);
23983027 }
23993028
24003029 static void record_steal_time(struct kvm_vcpu *vcpu)
24013030 {
2402
- struct kvm_host_map map;
2403
- struct kvm_steal_time *st;
3031
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3032
+ struct kvm_steal_time __user *st;
3033
+ struct kvm_memslots *slots;
3034
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3035
+ u64 steal;
3036
+ u32 version;
24043037
24053038 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
24063039 return;
24073040
2408
- /* -EAGAIN is returned in atomic context so we can just return. */
2409
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
2410
- &map, &vcpu->arch.st.cache, false))
3041
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
24113042 return;
24123043
2413
- st = map.hva +
2414
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
3044
+ slots = kvm_memslots(vcpu->kvm);
24153045
3046
+ if (unlikely(slots->generation != ghc->generation ||
3047
+ gpa != ghc->gpa ||
3048
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3049
+ /* We rely on the fact that it fits in a single page. */
3050
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3051
+
3052
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3053
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3054
+ return;
3055
+ }
3056
+
3057
+ st = (struct kvm_steal_time __user *)ghc->hva;
24163058 /*
24173059 * Doing a TLB flush here, on the guest's behalf, can avoid
24183060 * expensive IPIs.
24193061 */
2420
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
2421
- kvm_vcpu_flush_tlb(vcpu, false);
3062
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3063
+ u8 st_preempted = 0;
3064
+ int err = -EFAULT;
24223065
2423
- vcpu->arch.st.preempted = 0;
3066
+ if (!user_access_begin(st, sizeof(*st)))
3067
+ return;
24243068
2425
- if (st->version & 1)
2426
- st->version += 1; /* first time write, random junk */
3069
+ asm volatile("1: xchgb %0, %2\n"
3070
+ "xor %1, %1\n"
3071
+ "2:\n"
3072
+ _ASM_EXTABLE_UA(1b, 2b)
3073
+ : "+q" (st_preempted),
3074
+ "+&r" (err),
3075
+ "+m" (st->preempted));
3076
+ if (err)
3077
+ goto out;
24273078
2428
- st->version += 1;
3079
+ user_access_end();
3080
+
3081
+ vcpu->arch.st.preempted = 0;
3082
+
3083
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3084
+ st_preempted & KVM_VCPU_FLUSH_TLB);
3085
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
3086
+ kvm_vcpu_flush_tlb_guest(vcpu);
3087
+
3088
+ if (!user_access_begin(st, sizeof(*st)))
3089
+ goto dirty;
3090
+ } else {
3091
+ if (!user_access_begin(st, sizeof(*st)))
3092
+ return;
3093
+
3094
+ unsafe_put_user(0, &st->preempted, out);
3095
+ vcpu->arch.st.preempted = 0;
3096
+ }
3097
+
3098
+ unsafe_get_user(version, &st->version, out);
3099
+ if (version & 1)
3100
+ version += 1; /* first time write, random junk */
3101
+
3102
+ version += 1;
3103
+ unsafe_put_user(version, &st->version, out);
24293104
24303105 smp_wmb();
24313106
2432
- st->steal += current->sched_info.run_delay -
3107
+ unsafe_get_user(steal, &st->steal, out);
3108
+ steal += current->sched_info.run_delay -
24333109 vcpu->arch.st.last_steal;
24343110 vcpu->arch.st.last_steal = current->sched_info.run_delay;
3111
+ unsafe_put_user(steal, &st->steal, out);
24353112
2436
- smp_wmb();
3113
+ version += 1;
3114
+ unsafe_put_user(version, &st->version, out);
24373115
2438
- st->version += 1;
2439
-
2440
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
3116
+ out:
3117
+ user_access_end();
3118
+ dirty:
3119
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
24413120 }
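/*
 * Guest-side sketch of reading the steal-time record published by
 * record_steal_time() above.  The version field is assumed to follow the
 * usual pvclock convention: odd while KVM is updating, bumped to even
 * when the update is complete, so the reader retries until it sees a
 * stable even value.  The struct below is a simplified stand-in, not the
 * uapi layout.
 */
#include <stdint.h>

struct steal_time_shared {
	uint64_t steal;		/* nanoseconds of involuntary wait */
	uint32_t version;
	uint8_t  preempted;
};

static uint64_t read_steal_ns(const volatile struct steal_time_shared *st)
{
	uint32_t version;
	uint64_t steal;

	do {
		version = st->version;
		__sync_synchronize();	/* pairs with KVM's smp_wmb() above */
		steal = st->steal;
		__sync_synchronize();
	} while ((version & 1) || version != st->version);

	return steal;
}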
24423121
24433122 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
....@@ -2465,14 +3144,31 @@
24653144 return 1;
24663145 vcpu->arch.arch_capabilities = data;
24673146 break;
3147
+ case MSR_IA32_PERF_CAPABILITIES: {
3148
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
3149
+
3150
+ if (!msr_info->host_initiated)
3151
+ return 1;
3152
+ if (kvm_get_msr_feature(&msr_ent))
3153
+ return 1;
3154
+ if (data & ~msr_ent.data)
3155
+ return 1;
3156
+
3157
+ vcpu->arch.perf_capabilities = data;
3158
+
3159
+ return 0;
3160
+ }
24683161 case MSR_EFER:
24693162 return set_efer(vcpu, msr_info);
24703163 case MSR_K7_HWCR:
24713164 data &= ~(u64)0x40; /* ignore flush filter disable */
24723165 data &= ~(u64)0x100; /* ignore ignne emulation enable */
24733166 data &= ~(u64)0x8; /* ignore TLB cache disable */
2474
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
2475
- if (data != 0) {
3167
+
3168
+ /* Handle McStatusWrEn */
3169
+ if (data == BIT_ULL(18)) {
3170
+ vcpu->arch.msr_hwcr = data;
3171
+ } else if (data != 0) {
24763172 vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
24773173 data);
24783174 return 1;
....@@ -2493,9 +3189,9 @@
24933189 /* Values other than LBR and BTF are vendor-specific,
24943190 thus reserved and should throw a #GP */
24953191 return 1;
2496
- }
2497
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2498
- __func__, data);
3192
+ } else if (report_ignored_msrs)
3193
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
3194
+ __func__, data);
24993195 break;
25003196 case 0x200 ... 0x2ff:
25013197 return kvm_mtrr_set_msr(vcpu, msr, data);
....@@ -2520,15 +3216,46 @@
25203216 }
25213217 break;
25223218 case MSR_IA32_MISC_ENABLE:
2523
- vcpu->arch.ia32_misc_enable_msr = data;
3219
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3220
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
3221
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3222
+ return 1;
3223
+ vcpu->arch.ia32_misc_enable_msr = data;
3224
+ kvm_update_cpuid_runtime(vcpu);
3225
+ } else {
3226
+ vcpu->arch.ia32_misc_enable_msr = data;
3227
+ }
25243228 break;
25253229 case MSR_IA32_SMBASE:
25263230 if (!msr_info->host_initiated)
25273231 return 1;
25283232 vcpu->arch.smbase = data;
25293233 break;
3234
+ case MSR_IA32_POWER_CTL:
3235
+ vcpu->arch.msr_ia32_power_ctl = data;
3236
+ break;
25303237 case MSR_IA32_TSC:
2531
- kvm_write_tsc(vcpu, msr_info);
3238
+ if (msr_info->host_initiated) {
3239
+ kvm_synchronize_tsc(vcpu, data);
3240
+ } else {
3241
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3242
+ adjust_tsc_offset_guest(vcpu, adj);
3243
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
3244
+ }
3245
+ break;
3246
+ case MSR_IA32_XSS:
3247
+ if (!msr_info->host_initiated &&
3248
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3249
+ return 1;
3250
+ /*
3251
+ * KVM supports exposing PT to the guest, but does not support
3252
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3253
+ * XSAVES/XRSTORS to save/restore PT MSRs.
3254
+ */
3255
+ if (data & ~supported_xss)
3256
+ return 1;
3257
+ vcpu->arch.ia32_xss = data;
3258
+ kvm_update_cpuid_runtime(vcpu);
25323259 break;
25333260 case MSR_SMI_COUNT:
25343261 if (!msr_info->host_initiated)
....@@ -2536,46 +3263,54 @@
25363263 vcpu->arch.smi_count = data;
25373264 break;
25383265 case MSR_KVM_WALL_CLOCK_NEW:
3266
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3267
+ return 1;
3268
+
3269
+ kvm_write_wall_clock(vcpu->kvm, data);
3270
+ break;
25393271 case MSR_KVM_WALL_CLOCK:
2540
- vcpu->kvm->arch.wall_clock = data;
3272
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3273
+ return 1;
3274
+
25413275 kvm_write_wall_clock(vcpu->kvm, data);
25423276 break;
25433277 case MSR_KVM_SYSTEM_TIME_NEW:
2544
- case MSR_KVM_SYSTEM_TIME: {
2545
- struct kvm_arch *ka = &vcpu->kvm->arch;
3278
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3279
+ return 1;
25463280
2547
- kvmclock_reset(vcpu);
2548
-
2549
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
2550
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
2551
-
2552
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
2553
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2554
-
2555
- ka->boot_vcpu_runs_old_kvmclock = tmp;
2556
- }
2557
-
2558
- vcpu->arch.time = data;
2559
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2560
-
2561
- /* we verify if the enable bit is set... */
2562
- if (!(data & 1))
2563
- break;
2564
-
2565
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2566
- &vcpu->arch.pv_time, data & ~1ULL,
2567
- sizeof(struct pvclock_vcpu_time_info)))
2568
- vcpu->arch.pv_time_enabled = false;
2569
- else
2570
- vcpu->arch.pv_time_enabled = true;
2571
-
3281
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
25723282 break;
2573
- }
3283
+ case MSR_KVM_SYSTEM_TIME:
3284
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3285
+ return 1;
3286
+
3287
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
3288
+ break;
25743289 case MSR_KVM_ASYNC_PF_EN:
3290
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3291
+ return 1;
3292
+
25753293 if (kvm_pv_enable_async_pf(vcpu, data))
25763294 return 1;
25773295 break;
3296
+ case MSR_KVM_ASYNC_PF_INT:
3297
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3298
+ return 1;
3299
+
3300
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
3301
+ return 1;
3302
+ break;
3303
+ case MSR_KVM_ASYNC_PF_ACK:
3304
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3305
+ return 1;
3306
+ if (data & 0x1) {
3307
+ vcpu->arch.apf.pageready_pending = false;
3308
+ kvm_check_async_pf_completion(vcpu);
3309
+ }
3310
+ break;
25783311 case MSR_KVM_STEAL_TIME:
3312
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3313
+ return 1;
25793314
25803315 if (unlikely(!sched_info_on()))
25813316 return 1;
....@@ -2592,8 +3327,22 @@
25923327
25933328 break;
25943329 case MSR_KVM_PV_EOI_EN:
3330
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3331
+ return 1;
3332
+
25953333 if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
25963334 return 1;
3335
+ break;
3336
+
3337
+ case MSR_KVM_POLL_CONTROL:
3338
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3339
+ return 1;
3340
+
3341
+ /* only enable bit supported */
3342
+ if (data & (-1ULL << 1))
3343
+ return 1;
3344
+
3345
+ vcpu->arch.msr_kvm_poll_control = data;
25973346 break;
25983347
25993348 case MSR_IA32_MCG_CTL:
....@@ -2603,7 +3352,8 @@
26033352
26043353 case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
26053354 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
2606
- pr = true; /* fall through */
3355
+ pr = true;
3356
+ fallthrough;
26073357 case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
26083358 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
26093359 if (kvm_pmu_is_valid_msr(vcpu, msr))
....@@ -2624,6 +3374,8 @@
26243374 */
26253375 break;
26263376 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3377
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3378
+ case HV_X64_MSR_SYNDBG_OPTIONS:
26273379 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
26283380 case HV_X64_MSR_CRASH_CTL:
26293381 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2669,33 +3421,11 @@
26693421 return xen_hvm_config(vcpu, data);
26703422 if (kvm_pmu_is_valid_msr(vcpu, msr))
26713423 return kvm_pmu_set_msr(vcpu, msr_info);
2672
- if (!ignore_msrs) {
2673
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
2674
- msr, data);
2675
- return 1;
2676
- } else {
2677
- if (report_ignored_msrs)
2678
- vcpu_unimpl(vcpu,
2679
- "ignored wrmsr: 0x%x data 0x%llx\n",
2680
- msr, data);
2681
- break;
2682
- }
3424
+ return KVM_MSR_RET_INVALID;
26833425 }
26843426 return 0;
26853427 }
26863428 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2687
-
2688
-
2689
-/*
2690
- * Reads an msr value (of 'msr_index') into 'pdata'.
2691
- * Returns 0 on success, non-0 otherwise.
2692
- * Assumes vcpu_load() was already called.
2693
- */
2694
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
2695
-{
2696
- return kvm_x86_ops->get_msr(vcpu, msr);
2697
-}
2698
-EXPORT_SYMBOL_GPL(kvm_get_msr);
26993429
27003430 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
27013431 {
....@@ -2748,7 +3478,6 @@
27483478 case MSR_K8_SYSCFG:
27493479 case MSR_K8_TSEG_ADDR:
27503480 case MSR_K8_TSEG_MASK:
2751
- case MSR_K7_HWCR:
27523481 case MSR_VM_HSAVE_PA:
27533482 case MSR_K8_INT_PENDING_MSG:
27543483 case MSR_AMD64_NB_CFG:
....@@ -2757,6 +3486,17 @@
27573486 case MSR_IA32_PERF_CTL:
27583487 case MSR_AMD64_DC_CFG:
27593488 case MSR_F15H_EX_CFG:
3489
+ /*
3490
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
3491
+ * limit) MSRs. Just return 0, as we do not want to expose the host
3492
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
3493
+ * so for existing CPU-specific MSRs.
3494
+ */
3495
+ case MSR_RAPL_POWER_UNIT:
3496
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
3497
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
3498
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
3499
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
27603500 msr_info->data = 0;
27613501 break;
27623502 case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
....@@ -2765,7 +3505,7 @@
27653505 case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
27663506 case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
27673507 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2768
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
3508
+ return kvm_pmu_get_msr(vcpu, msr_info);
27693509 msr_info->data = 0;
27703510 break;
27713511 case MSR_IA32_UCODE_REV:
....@@ -2777,9 +3517,31 @@
27773517 return 1;
27783518 msr_info->data = vcpu->arch.arch_capabilities;
27793519 break;
2780
- case MSR_IA32_TSC:
2781
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
3520
+ case MSR_IA32_PERF_CAPABILITIES:
3521
+ if (!msr_info->host_initiated &&
3522
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
3523
+ return 1;
3524
+ msr_info->data = vcpu->arch.perf_capabilities;
27823525 break;
3526
+ case MSR_IA32_POWER_CTL:
3527
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
3528
+ break;
3529
+ case MSR_IA32_TSC: {
3530
+ /*
3531
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
3532
+ * even when not intercepted. AMD manual doesn't explicitly
3533
+ * state this but appears to behave the same.
3534
+ *
3535
+ * On userspace reads and writes, however, we unconditionally
3536
+ * return L1's TSC value to ensure backwards-compatible
3537
+ * behavior for migration.
3538
+ */
3539
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
3540
+ vcpu->arch.tsc_offset;
3541
+
3542
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
3543
+ break;
3544
+ }
27833545 case MSR_MTRRcap:
27843546 case 0x200 ... 0x2ff:
27853547 return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
....@@ -2805,7 +3567,6 @@
28053567 break;
28063568 case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
28073569 return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
2808
- break;
28093570 case MSR_IA32_TSCDEADLINE:
28103571 msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
28113572 break;
....@@ -2833,21 +3594,64 @@
28333594 msr_info->data = vcpu->arch.efer;
28343595 break;
28353596 case MSR_KVM_WALL_CLOCK:
3597
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3598
+ return 1;
3599
+
3600
+ msr_info->data = vcpu->kvm->arch.wall_clock;
3601
+ break;
28363602 case MSR_KVM_WALL_CLOCK_NEW:
3603
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3604
+ return 1;
3605
+
28373606 msr_info->data = vcpu->kvm->arch.wall_clock;
28383607 break;
28393608 case MSR_KVM_SYSTEM_TIME:
3609
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3610
+ return 1;
3611
+
3612
+ msr_info->data = vcpu->arch.time;
3613
+ break;
28403614 case MSR_KVM_SYSTEM_TIME_NEW:
3615
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3616
+ return 1;
3617
+
28413618 msr_info->data = vcpu->arch.time;
28423619 break;
28433620 case MSR_KVM_ASYNC_PF_EN:
2844
- msr_info->data = vcpu->arch.apf.msr_val;
3621
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3622
+ return 1;
3623
+
3624
+ msr_info->data = vcpu->arch.apf.msr_en_val;
3625
+ break;
3626
+ case MSR_KVM_ASYNC_PF_INT:
3627
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3628
+ return 1;
3629
+
3630
+ msr_info->data = vcpu->arch.apf.msr_int_val;
3631
+ break;
3632
+ case MSR_KVM_ASYNC_PF_ACK:
3633
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3634
+ return 1;
3635
+
3636
+ msr_info->data = 0;
28453637 break;
28463638 case MSR_KVM_STEAL_TIME:
3639
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3640
+ return 1;
3641
+
28473642 msr_info->data = vcpu->arch.st.msr_val;
28483643 break;
28493644 case MSR_KVM_PV_EOI_EN:
3645
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3646
+ return 1;
3647
+
28503648 msr_info->data = vcpu->arch.pv_eoi.msr_val;
3649
+ break;
3650
+ case MSR_KVM_POLL_CONTROL:
3651
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3652
+ return 1;
3653
+
3654
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
28513655 break;
28523656 case MSR_IA32_P5_MC_ADDR:
28533657 case MSR_IA32_P5_MC_TYPE:
....@@ -2857,6 +3661,12 @@
28573661 case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
28583662 return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
28593663 msr_info->host_initiated);
3664
+ case MSR_IA32_XSS:
3665
+ if (!msr_info->host_initiated &&
3666
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3667
+ return 1;
3668
+ msr_info->data = vcpu->arch.ia32_xss;
3669
+ break;
28603670 case MSR_K7_CLK_CTL:
28613671 /*
28623672 * Provide expected ramp-up count for K7. All other
....@@ -2870,6 +3680,8 @@
28703680 msr_info->data = 0x20000000;
28713681 break;
28723682 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3683
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3684
+ case HV_X64_MSR_SYNDBG_OPTIONS:
28733685 case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
28743686 case HV_X64_MSR_CRASH_CTL:
28753687 case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
....@@ -2879,7 +3691,6 @@
28793691 return kvm_hv_get_msr_common(vcpu,
28803692 msr_info->index, &msr_info->data,
28813693 msr_info->host_initiated);
2882
- break;
28833694 case MSR_IA32_BBL_CR_CTL3:
28843695 /* This legacy MSR exists but isn't fully documented in current
28853696 * silicon. It is however accessed by winxp in very narrow
....@@ -2912,20 +3723,13 @@
29123723 case MSR_MISC_FEATURES_ENABLES:
29133724 msr_info->data = vcpu->arch.msr_misc_features_enables;
29143725 break;
3726
+ case MSR_K7_HWCR:
3727
+ msr_info->data = vcpu->arch.msr_hwcr;
3728
+ break;
29153729 default:
29163730 if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
2917
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
2918
- if (!ignore_msrs) {
2919
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
2920
- msr_info->index);
2921
- return 1;
2922
- } else {
2923
- if (report_ignored_msrs)
2924
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
2925
- msr_info->index);
2926
- msr_info->data = 0;
2927
- }
2928
- break;
3731
+ return kvm_pmu_get_msr(vcpu, msr_info);
3732
+ return KVM_MSR_RET_INVALID;
29293733 }
29303734 return 0;
29313735 }
....@@ -2966,7 +3770,7 @@
29663770 unsigned size;
29673771
29683772 r = -EFAULT;
2969
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
3773
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
29703774 goto out;
29713775
29723776 r = -E2BIG;
....@@ -3037,24 +3841,33 @@
30373841 case KVM_CAP_HYPERV_VP_INDEX:
30383842 case KVM_CAP_HYPERV_EVENTFD:
30393843 case KVM_CAP_HYPERV_TLBFLUSH:
3844
+ case KVM_CAP_HYPERV_SEND_IPI:
3845
+ case KVM_CAP_HYPERV_CPUID:
30403846 case KVM_CAP_PCI_SEGMENT:
30413847 case KVM_CAP_DEBUGREGS:
30423848 case KVM_CAP_X86_ROBUST_SINGLESTEP:
30433849 case KVM_CAP_XSAVE:
30443850 case KVM_CAP_ASYNC_PF:
3851
+ case KVM_CAP_ASYNC_PF_INT:
30453852 case KVM_CAP_GET_TSC_KHZ:
30463853 case KVM_CAP_KVMCLOCK_CTRL:
30473854 case KVM_CAP_READONLY_MEM:
30483855 case KVM_CAP_HYPERV_TIME:
30493856 case KVM_CAP_IOAPIC_POLARITY_IGNORED:
30503857 case KVM_CAP_TSC_DEADLINE_TIMER:
3051
- case KVM_CAP_ENABLE_CAP_VM:
30523858 case KVM_CAP_DISABLE_QUIRKS:
30533859 case KVM_CAP_SET_BOOT_CPU_ID:
30543860 case KVM_CAP_SPLIT_IRQCHIP:
30553861 case KVM_CAP_IMMEDIATE_EXIT:
3862
+ case KVM_CAP_PMU_EVENT_FILTER:
30563863 case KVM_CAP_GET_MSR_FEATURES:
30573864 case KVM_CAP_MSR_PLATFORM_INFO:
3865
+ case KVM_CAP_EXCEPTION_PAYLOAD:
3866
+ case KVM_CAP_SET_GUEST_DEBUG:
3867
+ case KVM_CAP_LAST_CPU:
3868
+ case KVM_CAP_X86_USER_SPACE_MSR:
3869
+ case KVM_CAP_X86_MSR_FILTER:
3870
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
30583871 r = 1;
30593872 break;
30603873 case KVM_CAP_SYNC_REGS:
....@@ -3064,7 +3877,8 @@
30643877 r = KVM_CLOCK_TSC_STABLE;
30653878 break;
30663879 case KVM_CAP_X86_DISABLE_EXITS:
3067
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
3880
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
3881
+ KVM_X86_DISABLE_EXITS_CSTATE;
30683882 if(kvm_can_mwait_in_guest())
30693883 r |= KVM_X86_DISABLE_EXITS_MWAIT;
30703884 break;
....@@ -3077,10 +3891,10 @@
30773891 * fringe case that is not enabled except via specific settings
30783892 * of the module parameters.
30793893 */
3080
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
3894
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
30813895 break;
30823896 case KVM_CAP_VAPIC:
3083
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
3897
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
30843898 break;
30853899 case KVM_CAP_NR_VCPUS:
30863900 r = KVM_SOFT_MAX_VCPUS;
....@@ -3090,9 +3904,6 @@
30903904 break;
30913905 case KVM_CAP_MAX_VCPU_ID:
30923906 r = KVM_MAX_VCPU_ID;
3093
- break;
3094
- case KVM_CAP_NR_MEMSLOTS:
3095
- r = KVM_USER_MEM_SLOTS;
30963907 break;
30973908 case KVM_CAP_PV_MMU: /* obsolete */
30983909 r = 0;
....@@ -3110,8 +3921,20 @@
31103921 r = KVM_X2APIC_API_VALID_FLAGS;
31113922 break;
31123923 case KVM_CAP_NESTED_STATE:
3113
- r = kvm_x86_ops->get_nested_state ?
3114
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
3924
+ r = kvm_x86_ops.nested_ops->get_state ?
3925
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
3926
+ break;
3927
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
3928
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
3929
+ break;
3930
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
3931
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
3932
+ break;
3933
+ case KVM_CAP_SMALLER_MAXPHYADDR:
3934
+ r = (int) allow_smaller_maxphyaddr;
3935
+ break;
3936
+ case KVM_CAP_STEAL_TIME:
3937
+ r = sched_info_on();
31153938 break;
31163939 default:
31173940 break;
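/*
 * User-space sketch of probing the capabilities enumerated in the switch
 * above.  KVM_CHECK_EXTENSION returns 0 when a capability is absent and a
 * non-zero value (sometimes a feature bitmap, as for
 * KVM_CAP_X86_DISABLE_EXITS) when present.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void probe_caps(int vm_fd)
{
	int disable_exits = ioctl(vm_fd, KVM_CHECK_EXTENSION,
				  KVM_CAP_X86_DISABLE_EXITS);

	if (disable_exits > 0 && (disable_exits & KVM_X86_DISABLE_EXITS_MWAIT))
		printf("MWAIT exits can be disabled\n");

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_USER_SPACE_MSR) > 0)
		printf("user-space MSR handling supported\n");
}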
....@@ -3133,11 +3956,11 @@
31333956 unsigned n;
31343957
31353958 r = -EFAULT;
3136
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
3959
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
31373960 goto out;
31383961 n = msr_list.nmsrs;
31393962 msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
3140
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
3963
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
31413964 goto out;
31423965 r = -E2BIG;
31433966 if (n < msr_list.nmsrs)
....@@ -3159,7 +3982,7 @@
31593982 struct kvm_cpuid2 cpuid;
31603983
31613984 r = -EFAULT;
3162
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3985
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
31633986 goto out;
31643987
31653988 r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
....@@ -3168,12 +3991,12 @@
31683991 goto out;
31693992
31703993 r = -EFAULT;
3171
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3994
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
31723995 goto out;
31733996 r = 0;
31743997 break;
31753998 }
3176
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
3999
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
31774000 r = -EFAULT;
31784001 if (copy_to_user(argp, &kvm_mce_cap_supported,
31794002 sizeof(kvm_mce_cap_supported)))
....@@ -3205,9 +4028,9 @@
32054028 case KVM_GET_MSRS:
32064029 r = msr_io(NULL, argp, do_get_msr_feature, 1);
32074030 break;
3208
- }
32094031 default:
32104032 r = -EINVAL;
4033
+ break;
32114034 }
32124035 out:
32134036 return r;
....@@ -3227,14 +4050,17 @@
32274050 {
32284051 /* Address WBINVD may be executed by guest */
32294052 if (need_emulate_wbinvd(vcpu)) {
3230
- if (kvm_x86_ops->has_wbinvd_exit())
4053
+ if (kvm_x86_ops.has_wbinvd_exit())
32314054 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
32324055 else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
32334056 smp_call_function_single(vcpu->cpu,
32344057 wbinvd_ipi, NULL, 1);
32354058 }
32364059
3237
- kvm_x86_ops->vcpu_load(vcpu, cpu);
4060
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
4061
+
4062
+ /* Save host pkru register if supported */
4063
+ vcpu->arch.host_pkru = read_pkru();
32384064
32394065 /* Apply any externally detected TSC adjustments (due to suspend) */
32404066 if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
....@@ -3275,52 +4101,68 @@
32754101
32764102 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
32774103 {
3278
- struct kvm_host_map map;
3279
- struct kvm_steal_time *st;
4104
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4105
+ struct kvm_steal_time __user *st;
4106
+ struct kvm_memslots *slots;
4107
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
4108
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
32804109
4110
+ /*
4111
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
4112
+ * an instruction boundary and will not trigger guest emulation of any
4113
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
4114
+ * when this is true, for example allowing the vCPU to be marked
4115
+ * preempted if and only if the VM-Exit was due to a host interrupt.
4116
+ */
4117
+ if (!vcpu->arch.at_instruction_boundary) {
4118
+ vcpu->stat.preemption_other++;
4119
+ return;
4120
+ }
4121
+
4122
+ vcpu->stat.preemption_reported++;
32814123 if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
32824124 return;
32834125
32844126 if (vcpu->arch.st.preempted)
32854127 return;
32864128
3287
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
3288
- &vcpu->arch.st.cache, true))
4129
+ /* This happens on process exit */
4130
+ if (unlikely(current->mm != vcpu->kvm->mm))
32894131 return;
32904132
3291
- st = map.hva +
3292
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
4133
+ slots = kvm_memslots(vcpu->kvm);
32934134
3294
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4135
+ if (unlikely(slots->generation != ghc->generation ||
4136
+ gpa != ghc->gpa ||
4137
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
4138
+ return;
32954139
3296
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
4140
+ st = (struct kvm_steal_time __user *)ghc->hva;
4141
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
4142
+
4143
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4144
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4145
+
4146
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
32974147 }
32984148
32994149 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
33004150 {
33014151 int idx;
33024152
3303
- if (vcpu->preempted)
3304
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
4153
+ if (vcpu->preempted) {
4154
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
33054155
3306
- /*
3307
- * Disable page faults because we're in atomic context here.
3308
- * kvm_write_guest_offset_cached() would call might_fault()
3309
- * that relies on pagefault_disable() to tell if there's a
3310
- * bug. NOTE: the write to guest memory may not go through if
3311
- * during postcopy live migration or if there's heavy guest
3312
- * paging.
3313
- */
3314
- pagefault_disable();
3315
- /*
3316
- * kvm_memslots() will be called by
3317
- * kvm_write_guest_offset_cached() so take the srcu lock.
3318
- */
3319
- idx = srcu_read_lock(&vcpu->kvm->srcu);
3320
- kvm_steal_time_set_preempted(vcpu);
3321
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
3322
- pagefault_enable();
3323
- kvm_x86_ops->vcpu_put(vcpu);
4156
+ /*
4157
+ * Take the srcu lock as memslots will be accessed to check the gfn
4158
+ * cache generation against the memslots generation.
4159
+ */
4160
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
4161
+ kvm_steal_time_set_preempted(vcpu);
4162
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
4163
+ }
4164
+
4165
+ kvm_x86_ops.vcpu_put(vcpu);
33244166 vcpu->arch.last_host_tsc = rdtsc();
33254167 /*
33264168 * If userspace has set any breakpoints or watchpoints, dr6 is restored
....@@ -3334,7 +4176,7 @@
33344176 struct kvm_lapic_state *s)
33354177 {
33364178 if (vcpu->arch.apicv_active)
3337
- kvm_x86_ops->sync_pir_to_irr(vcpu);
4179
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
33384180
33394181 return kvm_apic_get_state(vcpu, s);
33404182 }
....@@ -3453,8 +4295,7 @@
34534295 for (bank = 0; bank < bank_num; bank++)
34544296 vcpu->arch.mce_banks[bank*4] = ~(u64)0;
34554297
3456
- if (kvm_x86_ops->setup_mce)
3457
- kvm_x86_ops->setup_mce(vcpu);
4298
+ kvm_x86_ops.setup_mce(vcpu);
34584299 out:
34594300 return r;
34604301 }
....@@ -3516,28 +4357,56 @@
35164357 process_smi(vcpu);
35174358
35184359 /*
3519
- * FIXME: pass injected and pending separately. This is only
3520
- * needed for nested virtualization, whose state cannot be
3521
- * migrated yet. For now we can combine them.
4360
+ * In guest mode, payload delivery should be deferred,
4361
+ * so that the L1 hypervisor can intercept #PF before
4362
+ * CR2 is modified (or intercept #DB before DR6 is
4363
+ * modified under nVMX). Unless the per-VM capability,
4364
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
4365
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
4366
+ * opportunistically defer the exception payload, deliver it if the
4367
+ * capability hasn't been requested before processing a
4368
+ * KVM_GET_VCPU_EVENTS.
35224369 */
3523
- events->exception.injected =
3524
- (vcpu->arch.exception.pending ||
3525
- vcpu->arch.exception.injected) &&
3526
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
4370
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
4371
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
4372
+ kvm_deliver_exception_payload(vcpu);
4373
+
4374
+ /*
4375
+ * The API doesn't provide the instruction length for software
4376
+ * exceptions, so don't report them. As long as the guest RIP
4377
+ * isn't advanced, we should expect to encounter the exception
4378
+ * again.
4379
+ */
4380
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
4381
+ events->exception.injected = 0;
4382
+ events->exception.pending = 0;
4383
+ } else {
4384
+ events->exception.injected = vcpu->arch.exception.injected;
4385
+ events->exception.pending = vcpu->arch.exception.pending;
4386
+ /*
4387
+ * For ABI compatibility, deliberately conflate
4388
+ * pending and injected exceptions when
4389
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
4390
+ */
4391
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4392
+ events->exception.injected |=
4393
+ vcpu->arch.exception.pending;
4394
+ }
35274395 events->exception.nr = vcpu->arch.exception.nr;
35284396 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3529
- events->exception.pad = 0;
35304397 events->exception.error_code = vcpu->arch.exception.error_code;
4398
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
4399
+ events->exception_payload = vcpu->arch.exception.payload;
35314400
35324401 events->interrupt.injected =
35334402 vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
35344403 events->interrupt.nr = vcpu->arch.interrupt.nr;
35354404 events->interrupt.soft = 0;
3536
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
4405
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
35374406
35384407 events->nmi.injected = vcpu->arch.nmi_injected;
35394408 events->nmi.pending = vcpu->arch.nmi_pending != 0;
3540
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
4409
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
35414410 events->nmi.pad = 0;
35424411
35434412 events->sipi_vector = 0; /* never valid when reporting to user space */
....@@ -3551,10 +4420,13 @@
35514420 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
35524421 | KVM_VCPUEVENT_VALID_SHADOW
35534422 | KVM_VCPUEVENT_VALID_SMM);
4423
+ if (vcpu->kvm->arch.exception_payload_enabled)
4424
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
4425
+
35544426 memset(&events->reserved, 0, sizeof(events->reserved));
35554427 }
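/*
 * User-space sketch of consuming the exception-payload state exposed by
 * the function above.  KVM_VCPUEVENT_VALID_PAYLOAD appears in events.flags
 * only when the VM has enabled KVM_CAP_EXCEPTION_PAYLOAD; otherwise
 * pending and injected exceptions are conflated, as noted in the comment.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int dump_pending_exception(int vcpu_fd)
{
	struct kvm_vcpu_events events;

	if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
		return -1;

	if ((events.flags & KVM_VCPUEVENT_VALID_PAYLOAD) &&
	    events.exception_has_payload)
		printf("vector %u, payload 0x%llx\n",
		       (unsigned)events.exception.nr,
		       (unsigned long long)events.exception_payload);
	return 0;
}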
35564428
3557
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
4429
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
35584430
35594431 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
35604432 struct kvm_vcpu_events *events)
....@@ -3562,12 +4434,24 @@
35624434 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
35634435 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
35644436 | KVM_VCPUEVENT_VALID_SHADOW
3565
- | KVM_VCPUEVENT_VALID_SMM))
4437
+ | KVM_VCPUEVENT_VALID_SMM
4438
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
35664439 return -EINVAL;
35674440
3568
- if (events->exception.injected &&
3569
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
3570
- is_guest_mode(vcpu)))
4441
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
4442
+ if (!vcpu->kvm->arch.exception_payload_enabled)
4443
+ return -EINVAL;
4444
+ if (events->exception.pending)
4445
+ events->exception.injected = 0;
4446
+ else
4447
+ events->exception_has_payload = 0;
4448
+ } else {
4449
+ events->exception.pending = 0;
4450
+ events->exception_has_payload = 0;
4451
+ }
4452
+
4453
+ if ((events->exception.injected || events->exception.pending) &&
4454
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
35714455 return -EINVAL;
35724456
35734457 /* INITs are latched while in SMM */
....@@ -3577,35 +4461,40 @@
35774461 return -EINVAL;
35784462
35794463 process_nmi(vcpu);
3580
- vcpu->arch.exception.injected = false;
3581
- vcpu->arch.exception.pending = events->exception.injected;
4464
+ vcpu->arch.exception.injected = events->exception.injected;
4465
+ vcpu->arch.exception.pending = events->exception.pending;
35824466 vcpu->arch.exception.nr = events->exception.nr;
35834467 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
35844468 vcpu->arch.exception.error_code = events->exception.error_code;
4469
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
4470
+ vcpu->arch.exception.payload = events->exception_payload;
35854471
35864472 vcpu->arch.interrupt.injected = events->interrupt.injected;
35874473 vcpu->arch.interrupt.nr = events->interrupt.nr;
35884474 vcpu->arch.interrupt.soft = events->interrupt.soft;
35894475 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3590
- kvm_x86_ops->set_interrupt_shadow(vcpu,
4476
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
35914477 events->interrupt.shadow);
35924478
35934479 vcpu->arch.nmi_injected = events->nmi.injected;
35944480 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
35954481 vcpu->arch.nmi_pending = events->nmi.pending;
3596
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
4482
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
35974483
35984484 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
35994485 lapic_in_kernel(vcpu))
36004486 vcpu->arch.apic->sipi_vector = events->sipi_vector;
36014487
36024488 if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
3603
- u32 hflags = vcpu->arch.hflags;
3604
- if (events->smi.smm)
3605
- hflags |= HF_SMM_MASK;
3606
- else
3607
- hflags &= ~HF_SMM_MASK;
3608
- kvm_set_hflags(vcpu, hflags);
4489
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
4490
+ if (events->smi.smm)
4491
+ vcpu->arch.hflags |= HF_SMM_MASK;
4492
+ else
4493
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
4494
+
4495
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
4496
+ kvm_smm_changed(vcpu);
4497
+ }
36094498
36104499 vcpu->arch.smi_pending = events->smi.pending;
36114500
....@@ -3614,12 +4503,13 @@
36144503 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
36154504 else
36164505 vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
3617
- if (lapic_in_kernel(vcpu)) {
3618
- if (events->smi.latched_init)
3619
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3620
- else
3621
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
3622
- }
4506
+ }
4507
+
4508
+ if (lapic_in_kernel(vcpu)) {
4509
+ if (events->smi.latched_init)
4510
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
4511
+ else
4512
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
36234513 }
36244514 }
36254515
....@@ -3633,12 +4523,11 @@
36334523 {
36344524 unsigned long val;
36354525
4526
+ memset(dbgregs, 0, sizeof(*dbgregs));
36364527 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
36374528 kvm_get_dr(vcpu, 6, &val);
36384529 dbgregs->dr6 = val;
36394530 dbgregs->dr7 = vcpu->arch.dr7;
3640
- dbgregs->flags = 0;
3641
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
36424531 }
36434532
36444533 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
....@@ -3655,7 +4544,6 @@
36554544 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
36564545 kvm_update_dr0123(vcpu);
36574546 vcpu->arch.dr6 = dbgregs->dr6;
3658
- kvm_update_dr6(vcpu);
36594547 vcpu->arch.dr7 = dbgregs->dr7;
36604548 kvm_update_dr7(vcpu);
36614549
....@@ -3666,7 +4554,7 @@
36664554
36674555 static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
36684556 {
3669
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4557
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
36704558 u64 xstate_bv = xsave->header.xfeatures;
36714559 u64 valid;
36724560
....@@ -3686,15 +4574,15 @@
36864574 */
36874575 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
36884576 while (valid) {
3689
- u64 feature = valid & -valid;
3690
- int index = fls64(feature) - 1;
3691
- void *src = get_xsave_addr(xsave, feature);
4577
+ u64 xfeature_mask = valid & -valid;
4578
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4579
+ void *src = get_xsave_addr(xsave, xfeature_nr);
36924580
36934581 if (src) {
36944582 u32 size, offset, ecx, edx;
3695
- cpuid_count(XSTATE_CPUID, index,
4583
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
36964584 &size, &offset, &ecx, &edx);
3697
- if (feature == XFEATURE_MASK_PKRU)
4585
+ if (xfeature_nr == XFEATURE_PKRU)
36984586 memcpy(dest + offset, &vcpu->arch.pkru,
36994587 sizeof(vcpu->arch.pkru));
37004588 else
....@@ -3702,13 +4590,13 @@
37024590
37034591 }
37044592
3705
- valid -= feature;
4593
+ valid -= xfeature_mask;
37064594 }
37074595 }
37084596
37094597 static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
37104598 {
3711
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
4599
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
37124600 u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
37134601 u64 valid;
37144602
....@@ -3729,22 +4617,22 @@
37294617 */
37304618 valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
37314619 while (valid) {
3732
- u64 feature = valid & -valid;
3733
- int index = fls64(feature) - 1;
3734
- void *dest = get_xsave_addr(xsave, feature);
4620
+ u64 xfeature_mask = valid & -valid;
4621
+ int xfeature_nr = fls64(xfeature_mask) - 1;
4622
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
37354623
37364624 if (dest) {
37374625 u32 size, offset, ecx, edx;
3738
- cpuid_count(XSTATE_CPUID, index,
4626
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
37394627 &size, &offset, &ecx, &edx);
3740
- if (feature == XFEATURE_MASK_PKRU)
4628
+ if (xfeature_nr == XFEATURE_PKRU)
37414629 memcpy(&vcpu->arch.pkru, src + offset,
37424630 sizeof(vcpu->arch.pkru));
37434631 else
37444632 memcpy(dest, src + offset, size);
37454633 }
37464634
3747
- valid -= feature;
4635
+ valid -= xfeature_mask;
37484636 }
37494637 }
37504638
....@@ -3756,7 +4644,7 @@
37564644 fill_xsave((u8 *) guest_xsave->region, vcpu);
37574645 } else {
37584646 memcpy(guest_xsave->region,
3759
- &vcpu->arch.guest_fpu.state.fxsave,
4647
+ &vcpu->arch.guest_fpu->state.fxsave,
37604648 sizeof(struct fxregs_state));
37614649 *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
37624650 XFEATURE_MASK_FPSSE;
....@@ -3778,15 +4666,14 @@
37784666 * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
37794667 * with old userspace.
37804668 */
3781
- if (xstate_bv & ~kvm_supported_xcr0() ||
3782
- mxcsr & ~mxcsr_feature_mask)
4669
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
37834670 return -EINVAL;
37844671 load_xsave(vcpu, (u8 *)guest_xsave->region);
37854672 } else {
37864673 if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
37874674 mxcsr & ~mxcsr_feature_mask)
37884675 return -EINVAL;
3789
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
4676
+ memcpy(&vcpu->arch.guest_fpu->state.fxsave,
37904677 guest_xsave->region, sizeof(struct fxregs_state));
37914678 }
37924679 return 0;
....@@ -3847,6 +4734,10 @@
38474734 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
38484735 struct kvm_enable_cap *cap)
38494736 {
4737
+ int r;
4738
+ uint16_t vmcs_version;
4739
+ void __user *user_ptr;
4740
+
38504741 if (cap->flags)
38514742 return -EINVAL;
38524743
....@@ -3854,11 +4745,37 @@
38544745 case KVM_CAP_HYPERV_SYNIC2:
38554746 if (cap->args[0])
38564747 return -EINVAL;
4748
+ fallthrough;
4749
+
38574750 case KVM_CAP_HYPERV_SYNIC:
38584751 if (!irqchip_in_kernel(vcpu->kvm))
38594752 return -EINVAL;
38604753 return kvm_hv_activate_synic(vcpu, cap->cap ==
38614754 KVM_CAP_HYPERV_SYNIC2);
4755
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4756
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
4757
+ return -ENOTTY;
4758
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
4759
+ if (!r) {
4760
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
4761
+ if (copy_to_user(user_ptr, &vmcs_version,
4762
+ sizeof(vmcs_version)))
4763
+ r = -EFAULT;
4764
+ }
4765
+ return r;
4766
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4767
+ if (!kvm_x86_ops.enable_direct_tlbflush)
4768
+ return -ENOTTY;
4769
+
4770
+ return kvm_x86_ops.enable_direct_tlbflush(vcpu);
4771
+
4772
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4773
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
4774
+ if (vcpu->arch.pv_cpuid.enforce)
4775
+ kvm_update_pv_runtime(vcpu);
4776
+
4777
+ return 0;
4778
+
38624779 default:
38634780 return -EINVAL;
38644781 }
....@@ -3885,7 +4802,8 @@
38854802 r = -EINVAL;
38864803 if (!lapic_in_kernel(vcpu))
38874804 goto out;
3888
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
4805
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
4806
+ GFP_KERNEL_ACCOUNT);
38894807
38904808 r = -ENOMEM;
38914809 if (!u.lapic)
....@@ -3916,7 +4834,7 @@
39164834 struct kvm_interrupt irq;
39174835
39184836 r = -EFAULT;
3919
- if (copy_from_user(&irq, argp, sizeof irq))
4837
+ if (copy_from_user(&irq, argp, sizeof(irq)))
39204838 goto out;
39214839 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
39224840 break;
....@@ -3934,7 +4852,7 @@
39344852 struct kvm_cpuid cpuid;
39354853
39364854 r = -EFAULT;
3937
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4855
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39384856 goto out;
39394857 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
39404858 break;
....@@ -3944,7 +4862,7 @@
39444862 struct kvm_cpuid2 cpuid;
39454863
39464864 r = -EFAULT;
3947
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4865
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39484866 goto out;
39494867 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
39504868 cpuid_arg->entries);
....@@ -3955,14 +4873,14 @@
39554873 struct kvm_cpuid2 cpuid;
39564874
39574875 r = -EFAULT;
3958
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
4876
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
39594877 goto out;
39604878 r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
39614879 cpuid_arg->entries);
39624880 if (r)
39634881 goto out;
39644882 r = -EFAULT;
3965
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
4883
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
39664884 goto out;
39674885 r = 0;
39684886 break;
....@@ -3983,13 +4901,13 @@
39834901 struct kvm_tpr_access_ctl tac;
39844902
39854903 r = -EFAULT;
3986
- if (copy_from_user(&tac, argp, sizeof tac))
4904
+ if (copy_from_user(&tac, argp, sizeof(tac)))
39874905 goto out;
39884906 r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
39894907 if (r)
39904908 goto out;
39914909 r = -EFAULT;
3992
- if (copy_to_user(argp, &tac, sizeof tac))
4910
+ if (copy_to_user(argp, &tac, sizeof(tac)))
39934911 goto out;
39944912 r = 0;
39954913 break;
....@@ -4002,7 +4920,7 @@
40024920 if (!lapic_in_kernel(vcpu))
40034921 goto out;
40044922 r = -EFAULT;
4005
- if (copy_from_user(&va, argp, sizeof va))
4923
+ if (copy_from_user(&va, argp, sizeof(va)))
40064924 goto out;
40074925 idx = srcu_read_lock(&vcpu->kvm->srcu);
40084926 r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
....@@ -4013,7 +4931,7 @@
40134931 u64 mcg_cap;
40144932
40154933 r = -EFAULT;
4016
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
4934
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
40174935 goto out;
40184936 r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
40194937 break;
....@@ -4022,7 +4940,7 @@
40224940 struct kvm_x86_mce mce;
40234941
40244942 r = -EFAULT;
4025
- if (copy_from_user(&mce, argp, sizeof mce))
4943
+ if (copy_from_user(&mce, argp, sizeof(mce)))
40264944 goto out;
40274945 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
40284946 break;
....@@ -4072,7 +4990,7 @@
40724990 break;
40734991 }
40744992 case KVM_GET_XSAVE: {
4075
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
4993
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
40764994 r = -ENOMEM;
40774995 if (!u.xsave)
40784996 break;
....@@ -4096,7 +5014,7 @@
40965014 break;
40975015 }
40985016 case KVM_GET_XCRS: {
4099
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
5017
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
41005018 r = -ENOMEM;
41015019 if (!u.xcrs)
41025020 break;
....@@ -4126,7 +5044,8 @@
41265044 r = -EINVAL;
41275045 user_tsc_khz = (u32)arg;
41285046
4129
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
5047
+ if (kvm_has_tsc_control &&
5048
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
41305049 goto out;
41315050
41325051 if (user_tsc_khz == 0)
....@@ -4159,7 +5078,7 @@
41595078 u32 user_data_size;
41605079
41615080 r = -EINVAL;
4162
- if (!kvm_x86_ops->get_nested_state)
5081
+ if (!kvm_x86_ops.nested_ops->get_state)
41635082 break;
41645083
41655084 BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
....@@ -4167,8 +5086,8 @@
41675086 if (get_user(user_data_size, &user_kvm_nested_state->size))
41685087 break;
41695088
4170
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
4171
- user_data_size);
5089
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5090
+ user_data_size);
41725091 if (r < 0)
41735092 break;
41745093
....@@ -4189,7 +5108,7 @@
41895108 int idx;
41905109
41915110 r = -EINVAL;
4192
- if (!kvm_x86_ops->set_nested_state)
5111
+ if (!kvm_x86_ops.nested_ops->set_state)
41935112 break;
41945113
41955114 r = -EFAULT;
....@@ -4201,16 +5120,38 @@
42015120 break;
42025121
42035122 if (kvm_state.flags &
4204
- ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
5123
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
5124
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5125
+ | KVM_STATE_NESTED_GIF_SET))
42055126 break;
42065127
42075128 /* nested_run_pending implies guest_mode. */
4208
- if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
5129
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5130
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
42095131 break;
42105132
42115133 idx = srcu_read_lock(&vcpu->kvm->srcu);
4212
- r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
5134
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
42135135 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5136
+ break;
5137
+ }
5138
+ case KVM_GET_SUPPORTED_HV_CPUID: {
5139
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
5140
+ struct kvm_cpuid2 cpuid;
5141
+
5142
+ r = -EFAULT;
5143
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5144
+ goto out;
5145
+
5146
+ r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
5147
+ cpuid_arg->entries);
5148
+ if (r)
5149
+ goto out;
5150
+
5151
+ r = -EFAULT;
5152
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5153
+ goto out;
5154
+ r = 0;
42145155 break;
42155156 }
42165157 default:
....@@ -4234,14 +5175,14 @@
42345175
42355176 if (addr > (unsigned int)(-3 * PAGE_SIZE))
42365177 return -EINVAL;
4237
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
5178
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
42385179 return ret;
42395180 }
42405181
42415182 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
42425183 u64 ident_addr)
42435184 {
4244
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
5185
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
42455186 }
42465187
42475188 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
....@@ -4382,9 +5323,6 @@
43825323 {
43835324 struct kvm_pit *pit = kvm->arch.vpit;
43845325
4385
- if (!pit)
4386
- return -ENXIO;
4387
-
43885326 /* pit->pit_state.lock was overloaded to prevent userspace from getting
43895327 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
43905328 * ioctls in parallel. Use a separate lock if that ioctl isn't rare.
....@@ -4396,50 +5334,13 @@
43965334 return 0;
43975335 }
43985336
4399
-/**
4400
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
4401
- * @kvm: kvm instance
4402
- * @log: slot id and address to which we copy the log
4403
- *
4404
- * Steps 1-4 below provide general overview of dirty page logging. See
4405
- * kvm_get_dirty_log_protect() function description for additional details.
4406
- *
4407
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
4408
- * always flush the TLB (step 4) even if previous step failed and the dirty
4409
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
4410
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
4411
- * writes will be marked dirty for next log read.
4412
- *
4413
- * 1. Take a snapshot of the bit and clear it if needed.
4414
- * 2. Write protect the corresponding page.
4415
- * 3. Copy the snapshot to the userspace.
4416
- * 4. Flush TLB's if needed.
4417
- */
4418
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
5337
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
44195338 {
4420
- bool is_dirty = false;
4421
- int r;
4422
-
4423
- mutex_lock(&kvm->slots_lock);
4424
-
44255339 /*
44265340 * Flush potentially hardware-cached dirty pages to dirty_bitmap.
44275341 */
4428
- if (kvm_x86_ops->flush_log_dirty)
4429
- kvm_x86_ops->flush_log_dirty(kvm);
4430
-
4431
- r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
4432
-
4433
- /*
4434
- * All the TLBs can be flushed out of mmu lock, see the comments in
4435
- * kvm_mmu_slot_remove_write_access().
4436
- */
4437
- lockdep_assert_held(&kvm->slots_lock);
4438
- if (is_dirty)
4439
- kvm_flush_remote_tlbs(kvm);
4440
-
4441
- mutex_unlock(&kvm->slots_lock);
4442
- return r;
5342
+ if (kvm_x86_ops.flush_log_dirty)
5343
+ kvm_x86_ops.flush_log_dirty(kvm);
44435344 }
44445345
44455346 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
....@@ -4454,8 +5355,8 @@
44545355 return 0;
44555356 }
44565357
4457
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4458
- struct kvm_enable_cap *cap)
5358
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
5359
+ struct kvm_enable_cap *cap)
44595360 {
44605361 int r;
44615362
....@@ -4513,10 +5414,25 @@
45135414 kvm->arch.hlt_in_guest = true;
45145415 if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
45155416 kvm->arch.pause_in_guest = true;
5417
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
5418
+ kvm->arch.cstate_in_guest = true;
45165419 r = 0;
45175420 break;
45185421 case KVM_CAP_MSR_PLATFORM_INFO:
45195422 kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
5423
+ r = 0;
5424
+ break;
5425
+ case KVM_CAP_EXCEPTION_PAYLOAD:
5426
+ kvm->arch.exception_payload_enabled = cap->args[0];
5427
+ r = 0;
5428
+ break;
5429
+ case KVM_CAP_X86_USER_SPACE_MSR:
5430
+ r = -EINVAL;
5431
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
5432
+ KVM_MSR_EXIT_REASON_UNKNOWN |
5433
+ KVM_MSR_EXIT_REASON_FILTER))
5434
+ break;
5435
+ kvm->arch.user_space_msr_mask = cap->args[0];
45205436 r = 0;
45215437 break;
45225438 default:
....@@ -4525,6 +5441,180 @@
45255441 }
45265442 return r;
45275443 }
5444
+
5445
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
5446
+{
5447
+ struct kvm_x86_msr_filter *msr_filter;
5448
+
5449
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
5450
+ if (!msr_filter)
5451
+ return NULL;
5452
+
5453
+ msr_filter->default_allow = default_allow;
5454
+ return msr_filter;
5455
+}
5456
+
5457
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
5458
+{
5459
+ u32 i;
5460
+
5461
+ if (!msr_filter)
5462
+ return;
5463
+
5464
+ for (i = 0; i < msr_filter->count; i++)
5465
+ kfree(msr_filter->ranges[i].bitmap);
5466
+
5467
+ kfree(msr_filter);
5468
+}
5469
+
5470
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
5471
+ struct kvm_msr_filter_range *user_range)
5472
+{
5473
+ struct msr_bitmap_range range;
5474
+ unsigned long *bitmap = NULL;
5475
+ size_t bitmap_size;
5476
+ int r;
5477
+
5478
+ if (!user_range->nmsrs)
5479
+ return 0;
5480
+
5481
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
5482
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
5483
+ return -EINVAL;
5484
+
5485
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
5486
+ if (IS_ERR(bitmap))
5487
+ return PTR_ERR(bitmap);
5488
+
5489
+ range = (struct msr_bitmap_range) {
5490
+ .flags = user_range->flags,
5491
+ .base = user_range->base,
5492
+ .nmsrs = user_range->nmsrs,
5493
+ .bitmap = bitmap,
5494
+ };
5495
+
5496
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
5497
+ r = -EINVAL;
5498
+ goto err;
5499
+ }
5500
+
5501
+ if (!range.flags) {
5502
+ r = -EINVAL;
5503
+ goto err;
5504
+ }
5505
+
5506
+ /* Everything ok, add this range identifier. */
5507
+ msr_filter->ranges[msr_filter->count] = range;
5508
+ msr_filter->count++;
5509
+
5510
+ return 0;
5511
+err:
5512
+ kfree(bitmap);
5513
+ return r;
5514
+}
5515
+
5516
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
5517
+ struct kvm_msr_filter *filter)
5518
+{
5519
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
5520
+ bool default_allow;
5521
+ bool empty = true;
5522
+ int r = 0;
5523
+ u32 i;
5524
+
5525
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
5526
+ return -EINVAL;
5527
+
5528
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
5529
+ empty &= !filter->ranges[i].nmsrs;
5530
+
5531
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
5532
+ if (empty && !default_allow)
5533
+ return -EINVAL;
5534
+
5535
+ new_filter = kvm_alloc_msr_filter(default_allow);
5536
+ if (!new_filter)
5537
+ return -ENOMEM;
5538
+
5539
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
5540
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
5541
+ if (r) {
5542
+ kvm_free_msr_filter(new_filter);
5543
+ return r;
5544
+ }
5545
+ }
5546
+
5547
+ mutex_lock(&kvm->lock);
5548
+
5549
+ /* The per-VM filter is protected by kvm->lock... */
5550
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
5551
+
5552
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
5553
+ synchronize_srcu(&kvm->srcu);
5554
+
5555
+ kvm_free_msr_filter(old_filter);
5556
+
5557
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
5558
+ mutex_unlock(&kvm->lock);
5559
+
5560
+ return 0;
5561
+}
5562
+
5563
+#ifdef CONFIG_KVM_COMPAT
5564
+/* for KVM_X86_SET_MSR_FILTER */
5565
+struct kvm_msr_filter_range_compat {
5566
+ __u32 flags;
5567
+ __u32 nmsrs;
5568
+ __u32 base;
5569
+ __u32 bitmap;
5570
+};
5571
+
5572
+struct kvm_msr_filter_compat {
5573
+ __u32 flags;
5574
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
5575
+};
5576
+
5577
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
5578
+
5579
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
5580
+ unsigned long arg)
5581
+{
5582
+ void __user *argp = (void __user *)arg;
5583
+ struct kvm *kvm = filp->private_data;
5584
+ long r = -ENOTTY;
5585
+
5586
+ switch (ioctl) {
5587
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
5588
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5589
+ struct kvm_msr_filter_compat filter_compat;
5590
+ struct kvm_msr_filter filter;
5591
+ int i;
5592
+
5593
+ if (copy_from_user(&filter_compat, user_msr_filter,
5594
+ sizeof(filter_compat)))
5595
+ return -EFAULT;
5596
+
5597
+ filter.flags = filter_compat.flags;
5598
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
5599
+ struct kvm_msr_filter_range_compat *cr;
5600
+
5601
+ cr = &filter_compat.ranges[i];
5602
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
5603
+ .flags = cr->flags,
5604
+ .nmsrs = cr->nmsrs,
5605
+ .base = cr->base,
5606
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
5607
+ };
5608
+ }
5609
+
5610
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5611
+ break;
5612
+ }
5613
+ }
5614
+
5615
+ return r;
5616
+}
5617
+#endif
45285618
45295619 long kvm_arch_vm_ioctl(struct file *filp,
45305620 unsigned int ioctl, unsigned long arg)
....@@ -4555,7 +5645,7 @@
45555645 if (kvm->created_vcpus)
45565646 goto set_identity_unlock;
45575647 r = -EFAULT;
4558
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
5648
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
45595649 goto set_identity_unlock;
45605650 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
45615651 set_identity_unlock:
....@@ -4639,7 +5729,7 @@
46395729 if (r)
46405730 goto get_irqchip_out;
46415731 r = -EFAULT;
4642
- if (copy_to_user(argp, chip, sizeof *chip))
5732
+ if (copy_to_user(argp, chip, sizeof(*chip)))
46435733 goto get_irqchip_out;
46445734 r = 0;
46455735 get_irqchip_out:
....@@ -4660,9 +5750,6 @@
46605750 if (!irqchip_kernel(kvm))
46615751 goto set_irqchip_out;
46625752 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
4663
- if (r)
4664
- goto set_irqchip_out;
4665
- r = 0;
46665753 set_irqchip_out:
46675754 kfree(chip);
46685755 break;
....@@ -4685,7 +5772,7 @@
46855772 }
46865773 case KVM_SET_PIT: {
46875774 r = -EFAULT;
4688
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
5775
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
46895776 goto out;
46905777 mutex_lock(&kvm->lock);
46915778 r = -ENXIO;
....@@ -4726,6 +5813,9 @@
47265813 struct kvm_reinject_control control;
47275814 r = -EFAULT;
47285815 if (copy_from_user(&control, argp, sizeof(control)))
5816
+ goto out;
5817
+ r = -ENXIO;
5818
+ if (!kvm->arch.vpit)
47295819 goto out;
47305820 r = kvm_vm_ioctl_reinject(kvm, &control);
47315821 break;
....@@ -4790,19 +5880,10 @@
47905880 r = 0;
47915881 break;
47925882 }
4793
- case KVM_ENABLE_CAP: {
4794
- struct kvm_enable_cap cap;
4795
-
4796
- r = -EFAULT;
4797
- if (copy_from_user(&cap, argp, sizeof(cap)))
4798
- goto out;
4799
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
4800
- break;
4801
- }
48025883 case KVM_MEMORY_ENCRYPT_OP: {
48035884 r = -ENOTTY;
4804
- if (kvm_x86_ops->mem_enc_op)
4805
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
5885
+ if (kvm_x86_ops.mem_enc_op)
5886
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
48065887 break;
48075888 }
48085889 case KVM_MEMORY_ENCRYPT_REG_REGION: {
....@@ -4813,8 +5894,8 @@
48135894 goto out;
48145895
48155896 r = -ENOTTY;
4816
- if (kvm_x86_ops->mem_enc_reg_region)
4817
- r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
5897
+ if (kvm_x86_ops.mem_enc_reg_region)
5898
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, &region);
48185899 break;
48195900 }
48205901 case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
....@@ -4825,8 +5906,8 @@
48255906 goto out;
48265907
48275908 r = -ENOTTY;
4828
- if (kvm_x86_ops->mem_enc_unreg_region)
4829
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
5909
+ if (kvm_x86_ops.mem_enc_unreg_region)
5910
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, &region);
48305911 break;
48315912 }
48325913 case KVM_HYPERV_EVENTFD: {
....@@ -4838,6 +5919,19 @@
48385919 r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
48395920 break;
48405921 }
5922
+ case KVM_SET_PMU_EVENT_FILTER:
5923
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
5924
+ break;
5925
+ case KVM_X86_SET_MSR_FILTER: {
5926
+ struct kvm_msr_filter __user *user_msr_filter = argp;
5927
+ struct kvm_msr_filter filter;
5928
+
5929
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
5930
+ return -EFAULT;
5931
+
5932
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
5933
+ break;
5934
+ }
48415935 default:
48425936 r = -ENOTTY;
48435937 }
....@@ -4847,58 +5941,96 @@
48475941
48485942 static void kvm_init_msr_list(void)
48495943 {
5944
+ struct x86_pmu_capability x86_pmu;
48505945 u32 dummy[2];
4851
- unsigned i, j;
5946
+ unsigned i;
48525947
4853
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
4854
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
5948
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
5949
+ "Please update the fixed PMCs in msrs_to_save_all[]");
5950
+
5951
+ perf_get_x86_pmu_capability(&x86_pmu);
5952
+
5953
+ num_msrs_to_save = 0;
5954
+ num_emulated_msrs = 0;
5955
+ num_msr_based_features = 0;
5956
+
5957
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
5958
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
48555959 continue;
48565960
48575961 /*
48585962 * Even MSRs that are valid in the host may not be exposed
48595963 * to the guests in some cases.
48605964 */
4861
- switch (msrs_to_save[i]) {
5965
+ switch (msrs_to_save_all[i]) {
48625966 case MSR_IA32_BNDCFGS:
48635967 if (!kvm_mpx_supported())
48645968 continue;
48655969 break;
48665970 case MSR_TSC_AUX:
4867
- if (!kvm_x86_ops->rdtscp_supported())
5971
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
5972
+ continue;
5973
+ break;
5974
+ case MSR_IA32_UMWAIT_CONTROL:
5975
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
5976
+ continue;
5977
+ break;
5978
+ case MSR_IA32_RTIT_CTL:
5979
+ case MSR_IA32_RTIT_STATUS:
5980
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
5981
+ continue;
5982
+ break;
5983
+ case MSR_IA32_RTIT_CR3_MATCH:
5984
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5985
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
5986
+ continue;
5987
+ break;
5988
+ case MSR_IA32_RTIT_OUTPUT_BASE:
5989
+ case MSR_IA32_RTIT_OUTPUT_MASK:
5990
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5991
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
5992
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
5993
+ continue;
5994
+ break;
5995
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
5996
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
5997
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
5998
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
5999
+ continue;
6000
+ break;
6001
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
6002
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
6003
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
6004
+ continue;
6005
+ break;
6006
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
6007
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
6008
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
48686009 continue;
48696010 break;
48706011 default:
48716012 break;
48726013 }
48736014
4874
- if (j < i)
4875
- msrs_to_save[j] = msrs_to_save[i];
4876
- j++;
6015
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
48776016 }
4878
- num_msrs_to_save = j;
48796017
4880
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
4881
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
6018
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
6019
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
48826020 continue;
48836021
4884
- if (j < i)
4885
- emulated_msrs[j] = emulated_msrs[i];
4886
- j++;
6022
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
48876023 }
4888
- num_emulated_msrs = j;
48896024
4890
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
6025
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
48916026 struct kvm_msr_entry msr;
48926027
4893
- msr.index = msr_based_features[i];
6028
+ msr.index = msr_based_features_all[i];
48946029 if (kvm_get_msr_feature(&msr))
48956030 continue;
48966031
4897
- if (j < i)
4898
- msr_based_features[j] = msr_based_features[i];
4899
- j++;
6032
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
49006033 }
4901
- num_msr_based_features = j;
49026034 }
49036035
49046036 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
....@@ -4947,13 +6079,13 @@
49476079 static void kvm_set_segment(struct kvm_vcpu *vcpu,
49486080 struct kvm_segment *var, int seg)
49496081 {
4950
- kvm_x86_ops->set_segment(vcpu, var, seg);
6082
+ kvm_x86_ops.set_segment(vcpu, var, seg);
49516083 }
49526084
49536085 void kvm_get_segment(struct kvm_vcpu *vcpu,
49546086 struct kvm_segment *var, int seg)
49556087 {
4956
- kvm_x86_ops->get_segment(vcpu, var, seg);
6088
+ kvm_x86_ops.get_segment(vcpu, var, seg);
49576089 }
49586090
49596091 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
....@@ -4965,7 +6097,7 @@
49656097
49666098 /* NPT walks are always user-walks */
49676099 access |= PFERR_USER_MASK;
4968
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
6100
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
49696101
49706102 return t_gpa;
49716103 }
....@@ -4973,14 +6105,14 @@
49736105 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
49746106 struct x86_exception *exception)
49756107 {
4976
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6108
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49776109 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49786110 }
49796111
49806112 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
49816113 struct x86_exception *exception)
49826114 {
4983
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6115
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49846116 access |= PFERR_FETCH_MASK;
49856117 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49866118 }
....@@ -4988,7 +6120,7 @@
49886120 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
49896121 struct x86_exception *exception)
49906122 {
4991
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6123
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
49926124 access |= PFERR_WRITE_MASK;
49936125 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
49946126 }
....@@ -5037,7 +6169,7 @@
50376169 struct x86_exception *exception)
50386170 {
50396171 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5040
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6172
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50416173 unsigned offset;
50426174 int ret;
50436175
....@@ -5062,7 +6194,7 @@
50626194 gva_t addr, void *val, unsigned int bytes,
50636195 struct x86_exception *exception)
50646196 {
5065
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
6197
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
50666198
50676199 /*
50686200 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
....@@ -5083,7 +6215,7 @@
50836215 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
50846216 u32 access = 0;
50856217
5086
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6218
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
50876219 access |= PFERR_USER_MASK;
50886220
50896221 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
....@@ -5136,7 +6268,7 @@
51366268 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
51376269 u32 access = PFERR_WRITE_MASK;
51386270
5139
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
6271
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
51406272 access |= PFERR_USER_MASK;
51416273
51426274 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
....@@ -5149,13 +6281,6 @@
51496281 /* kvm_write_guest_virt_system can pull in tons of pages. */
51506282 vcpu->arch.l1tf_flush_l1d = true;
51516283
5152
- /*
5153
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
5154
- * is returned, but our callers are not ready for that and they blindly
5155
- * call kvm_inject_page_fault. Ensure that they at least do not leak
5156
- * uninitialized kernel stack memory into cr2 and error code.
5157
- */
5158
- memset(exception, 0, sizeof(*exception));
51596284 return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
51606285 PFERR_WRITE_MASK, exception);
51616286 }
....@@ -5163,25 +6288,23 @@
51636288
51646289 int handle_ud(struct kvm_vcpu *vcpu)
51656290 {
6291
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
51666292 int emul_type = EMULTYPE_TRAP_UD;
5167
- enum emulation_result er;
51686293 char sig[5]; /* ud2; .ascii "kvm" */
51696294 struct x86_exception e;
6295
+
6296
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
6297
+ return 1;
51706298
51716299 if (force_emulation_prefix &&
51726300 kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
51736301 sig, sizeof(sig), &e) == 0 &&
5174
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
6302
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
51756303 kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
5176
- emul_type = 0;
6304
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
51776305 }
51786306
5179
- er = kvm_emulate_instruction(vcpu, emul_type);
5180
- if (er == EMULATE_USER_EXIT)
5181
- return 0;
5182
- if (er != EMULATE_DONE)
5183
- kvm_queue_exception(vcpu, UD_VECTOR);
5184
- return 1;
6307
+ return kvm_emulate_instruction(vcpu, emul_type);
51856308 }
51866309 EXPORT_SYMBOL_GPL(handle_ud);
51876310
....@@ -5204,7 +6327,7 @@
52046327 gpa_t *gpa, struct x86_exception *exception,
52056328 bool write)
52066329 {
5207
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
6330
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
52086331 | (write ? PFERR_WRITE_MASK : 0);
52096332
52106333 /*
....@@ -5214,7 +6337,7 @@
52146337 */
52156338 if (vcpu_match_mmio_gva(vcpu, gva)
52166339 && !permission_fault(vcpu, vcpu->arch.walk_mmu,
5217
- vcpu->arch.access, 0, access)) {
6340
+ vcpu->arch.mmio_access, 0, access)) {
52186341 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
52196342 (gva & (PAGE_SIZE - 1));
52206343 trace_vcpu_match_mmio(gva, *gpa, write, false);
....@@ -5323,7 +6446,7 @@
53236446 int handled, ret;
53246447 bool write = ops->write;
53256448 struct kvm_mmio_fragment *frag;
5326
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6449
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
53276450
53286451 /*
53296452 * If the exit was due to a NPF we may already have a GPA.
....@@ -5332,10 +6455,9 @@
53326455 * operation using rep will only have the initial GPA from the NPF
53336456 * occurred.
53346457 */
5335
- if (vcpu->arch.gpa_available &&
5336
- emulator_can_use_gpa(ctxt) &&
5337
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
5338
- gpa = vcpu->arch.gpa_val;
6458
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
6459
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
6460
+ gpa = ctxt->gpa_val;
53396461 ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
53406462 } else {
53416463 ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
....@@ -5456,9 +6578,10 @@
54566578 unsigned int bytes,
54576579 struct x86_exception *exception)
54586580 {
6581
+ struct kvm_host_map map;
54596582 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6583
+ u64 page_line_mask;
54606584 gpa_t gpa;
5461
- struct page *page;
54626585 char *kaddr;
54636586 bool exchanged;
54646587
....@@ -5472,15 +6595,23 @@
54726595 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
54736596 goto emul_write;
54746597
5475
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
6598
+ /*
6599
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
6600
+ * enabled in the host and the access splits a cache line.
6601
+ */
6602
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
6603
+ page_line_mask = ~(cache_line_size() - 1);
6604
+ else
6605
+ page_line_mask = PAGE_MASK;
6606
+
6607
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
54766608 goto emul_write;
54776609
5478
- page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
5479
- if (is_error_page(page))
6610
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
54806611 goto emul_write;
54816612
5482
- kaddr = kmap_atomic(page);
5483
- kaddr += offset_in_page(gpa);
6613
+ kaddr = map.hva + offset_in_page(gpa);
6614
+
54846615 switch (bytes) {
54856616 case 1:
54866617 exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
....@@ -5497,13 +6628,12 @@
54976628 default:
54986629 BUG();
54996630 }
5500
- kunmap_atomic(kaddr);
5501
- kvm_release_page_dirty(page);
6631
+
6632
+ kvm_vcpu_unmap(vcpu, &map, true);
55026633
55036634 if (!exchanged)
55046635 return X86EMUL_CMPXCHG_FAILED;
55056636
5506
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
55076637 kvm_page_track_write(vcpu, gpa, new, bytes);
55086638
55096639 return X86EMUL_CONTINUE;
....@@ -5557,11 +6687,9 @@
55576687 return 0;
55586688 }
55596689
5560
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
5561
- int size, unsigned short port, void *val,
5562
- unsigned int count)
6690
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
6691
+ unsigned short port, void *val, unsigned int count)
55636692 {
5564
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
55656693 int ret;
55666694
55676695 if (vcpu->arch.pio.count)
....@@ -5581,20 +6709,33 @@
55816709 return 0;
55826710 }
55836711
5584
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
5585
- int size, unsigned short port,
5586
- const void *val, unsigned int count)
6712
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
6713
+ int size, unsigned short port, void *val,
6714
+ unsigned int count)
55876715 {
5588
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6716
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
55896717
6718
+}
6719
+
6720
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
6721
+ unsigned short port, const void *val,
6722
+ unsigned int count)
6723
+{
55906724 memcpy(vcpu->arch.pio_data, val, size * count);
55916725 trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
55926726 return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
55936727 }
55946728
6729
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
6730
+ int size, unsigned short port,
6731
+ const void *val, unsigned int count)
6732
+{
6733
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
6734
+}
6735
+
55956736 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
55966737 {
5597
- return kvm_x86_ops->get_segment_base(vcpu, seg);
6738
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
55986739 }
55996740
56006741 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
....@@ -5607,7 +6748,7 @@
56076748 if (!need_emulate_wbinvd(vcpu))
56086749 return X86EMUL_CONTINUE;
56096750
5610
- if (kvm_x86_ops->has_wbinvd_exit()) {
6751
+ if (kvm_x86_ops.has_wbinvd_exit()) {
56116752 int cpu = get_cpu();
56126753
56136754 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
....@@ -5712,27 +6853,27 @@
57126853
57136854 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
57146855 {
5715
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
6856
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
57166857 }
57176858
57186859 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57196860 {
5720
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
6861
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
57216862 }
57226863
57236864 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57246865 {
5725
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
6866
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
57266867 }
57276868
57286869 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57296870 {
5730
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
6871
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
57316872 }
57326873
57336874 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
57346875 {
5735
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
6876
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
57366877 }
57376878
57386879 static unsigned long emulator_get_cached_segment_base(
....@@ -5810,28 +6951,33 @@
58106951 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
58116952 u32 msr_index, u64 *pdata)
58126953 {
5813
- struct msr_data msr;
6954
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
58146955 int r;
58156956
5816
- msr.index = msr_index;
5817
- msr.host_initiated = false;
5818
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
5819
- if (r)
5820
- return r;
6957
+ r = kvm_get_msr(vcpu, msr_index, pdata);
58216958
5822
- *pdata = msr.data;
5823
- return 0;
6959
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
6960
+ /* Bounce to user space */
6961
+ return X86EMUL_IO_NEEDED;
6962
+ }
6963
+
6964
+ return r;
58246965 }
58256966
58266967 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
58276968 u32 msr_index, u64 data)
58286969 {
5829
- struct msr_data msr;
6970
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
6971
+ int r;
58306972
5831
- msr.data = data;
5832
- msr.index = msr_index;
5833
- msr.host_initiated = false;
5834
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
6973
+ r = kvm_set_msr(vcpu, msr_index, data);
6974
+
6975
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
6976
+ /* Bounce to user space */
6977
+ return X86EMUL_IO_NEEDED;
6978
+ }
6979
+
6980
+ return r;
58356981 }
58366982
58376983 static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
....@@ -5851,7 +6997,7 @@
58516997 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
58526998 u32 pmc)
58536999 {
5854
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
7000
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
58557001 }
58567002
58577003 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
....@@ -5869,13 +7015,35 @@
58697015 struct x86_instruction_info *info,
58707016 enum x86_intercept_stage stage)
58717017 {
5872
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
7018
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
7019
+ &ctxt->exception);
58737020 }
58747021
58757022 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
5876
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
7023
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
7024
+ bool exact_only)
58777025 {
5878
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
7026
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
7027
+}
7028
+
7029
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
7030
+{
7031
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
7032
+}
7033
+
7034
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
7035
+{
7036
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
7037
+}
7038
+
7039
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
7040
+{
7041
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
7042
+}
7043
+
7044
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
7045
+{
7046
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
58797047 }
58807048
58817049 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
....@@ -5890,7 +7058,7 @@
58907058
58917059 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
58927060 {
5893
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
7061
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
58947062 }
58957063
58967064 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
....@@ -5900,12 +7068,26 @@
59007068
59017069 static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
59027070 {
5903
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
7071
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7072
+
7073
+ vcpu->arch.hflags = emul_flags;
7074
+ kvm_mmu_reset_context(vcpu);
59047075 }
59057076
5906
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
7077
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
7078
+ const char *smstate)
59077079 {
5908
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
7080
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
7081
+}
7082
+
7083
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
7084
+{
7085
+ kvm_smm_changed(emul_to_vcpu(ctxt));
7086
+}
7087
+
7088
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
7089
+{
7090
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
59097091 }
59107092
59117093 static const struct x86_emulate_ops emulate_ops = {
....@@ -5944,15 +7126,21 @@
59447126 .fix_hypercall = emulator_fix_hypercall,
59457127 .intercept = emulator_intercept,
59467128 .get_cpuid = emulator_get_cpuid,
7129
+ .guest_has_long_mode = emulator_guest_has_long_mode,
7130
+ .guest_has_movbe = emulator_guest_has_movbe,
7131
+ .guest_has_fxsr = emulator_guest_has_fxsr,
7132
+ .guest_has_rdpid = emulator_guest_has_rdpid,
59477133 .set_nmi_mask = emulator_set_nmi_mask,
59487134 .get_hflags = emulator_get_hflags,
59497135 .set_hflags = emulator_set_hflags,
59507136 .pre_leave_smm = emulator_pre_leave_smm,
7137
+ .post_leave_smm = emulator_post_leave_smm,
7138
+ .set_xcr = emulator_set_xcr,
59517139 };
59527140
59537141 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
59547142 {
5955
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
7143
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
59567144 /*
59577145 * an sti; sti; sequence only disable interrupts for the first
59587146 * instruction. So, if the last instruction, be it emulated or
....@@ -5963,7 +7151,7 @@
59637151 if (int_shadow & mask)
59647152 mask = 0;
59657153 if (unlikely(int_shadow || mask)) {
5966
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
7154
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
59677155 if (!mask)
59687156 kvm_make_request(KVM_REQ_EVENT, vcpu);
59697157 }
....@@ -5971,9 +7159,9 @@
59717159
59727160 static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
59737161 {
5974
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7162
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59757163 if (ctxt->exception.vector == PF_VECTOR)
5976
- return kvm_propagate_fault(vcpu, &ctxt->exception);
7164
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
59777165
59787166 if (ctxt->exception.error_code_valid)
59797167 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
....@@ -5983,13 +7171,31 @@
59837171 return false;
59847172 }
59857173
7174
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
7175
+{
7176
+ struct x86_emulate_ctxt *ctxt;
7177
+
7178
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
7179
+ if (!ctxt) {
7180
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
7181
+ return NULL;
7182
+ }
7183
+
7184
+ ctxt->vcpu = vcpu;
7185
+ ctxt->ops = &emulate_ops;
7186
+ vcpu->arch.emulate_ctxt = ctxt;
7187
+
7188
+ return ctxt;
7189
+}
7190
+
59867191 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
59877192 {
5988
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7193
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
59897194 int cs_db, cs_l;
59907195
5991
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
7196
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
59927197
7198
+ ctxt->gpa_available = false;
59937199 ctxt->eflags = kvm_get_rflags(vcpu);
59947200 ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
59957201
....@@ -6003,13 +7209,18 @@
60037209 BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
60047210 BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
60057211
7212
+ ctxt->interruptibility = 0;
7213
+ ctxt->have_exception = false;
7214
+ ctxt->exception.vector = -1;
7215
+ ctxt->perm_ok = false;
7216
+
60067217 init_decode_cache(ctxt);
60077218 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
60087219 }
60097220
6010
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
7221
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
60117222 {
6012
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7223
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
60137224 int ret;
60147225
60157226 init_emulate_ctxt(vcpu);
....@@ -6019,37 +7230,43 @@
60197230 ctxt->_eip = ctxt->eip + inc_eip;
60207231 ret = emulate_int_real(ctxt, irq);
60217232
6022
- if (ret != X86EMUL_CONTINUE)
6023
- return EMULATE_FAIL;
6024
-
6025
- ctxt->eip = ctxt->_eip;
6026
- kvm_rip_write(vcpu, ctxt->eip);
6027
- kvm_set_rflags(vcpu, ctxt->eflags);
6028
-
6029
- return EMULATE_DONE;
7233
+ if (ret != X86EMUL_CONTINUE) {
7234
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7235
+ } else {
7236
+ ctxt->eip = ctxt->_eip;
7237
+ kvm_rip_write(vcpu, ctxt->eip);
7238
+ kvm_set_rflags(vcpu, ctxt->eflags);
7239
+ }
60307240 }
60317241 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
60327242
60337243 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
60347244 {
6035
- int r = EMULATE_DONE;
6036
-
60377245 ++vcpu->stat.insn_emulation_fail;
60387246 trace_kvm_emulate_insn_failed(vcpu);
60397247
6040
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
6041
- return EMULATE_FAIL;
7248
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
7249
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7250
+ return 1;
7251
+ }
60427252
6043
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
7253
+ if (emulation_type & EMULTYPE_SKIP) {
60447254 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
60457255 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
60467256 vcpu->run->internal.ndata = 0;
6047
- r = EMULATE_USER_EXIT;
7257
+ return 0;
60487258 }
60497259
60507260 kvm_queue_exception(vcpu, UD_VECTOR);
60517261
6052
- return r;
7262
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
7263
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7264
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
7265
+ vcpu->run->internal.ndata = 0;
7266
+ return 0;
7267
+ }
7268
+
7269
+ return 1;
60537270 }
60547271
60557272 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
....@@ -6059,13 +7276,14 @@
60597276 gpa_t gpa = cr2_or_gpa;
60607277 kvm_pfn_t pfn;
60617278
6062
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7279
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
60637280 return false;
60647281
6065
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7282
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7283
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
60667284 return false;
60677285
6068
- if (!vcpu->arch.mmu.direct_map) {
7286
+ if (!vcpu->arch.mmu->direct_map) {
60697287 /*
60707288 * Write permission should be allowed since only
60717289 * write access need to be emulated.
....@@ -6098,7 +7316,7 @@
60987316 kvm_release_pfn_clean(pfn);
60997317
61007318 /* The instructions are well-emulated on direct mmu. */
6101
- if (vcpu->arch.mmu.direct_map) {
7319
+ if (vcpu->arch.mmu->direct_map) {
61027320 unsigned int indirect_shadow_pages;
61037321
61047322 spin_lock(&vcpu->kvm->mmu_lock);
....@@ -6150,10 +7368,11 @@
61507368 */
61517369 vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
61527370
6153
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
7371
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
61547372 return false;
61557373
6156
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
7374
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
7375
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
61577376 return false;
61587377
61597378 if (x86_page_table_writing_insn(ctxt))
....@@ -6165,7 +7384,7 @@
61657384 vcpu->arch.last_retry_eip = ctxt->eip;
61667385 vcpu->arch.last_retry_addr = cr2_or_gpa;
61677386
6168
- if (!vcpu->arch.mmu.direct_map)
7387
+ if (!vcpu->arch.mmu->direct_map)
61697388 gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
61707389
61717390 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
....@@ -6189,16 +7408,6 @@
61897408 kvm_mmu_reset_context(vcpu);
61907409 }
61917410
6192
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
6193
-{
6194
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
6195
-
6196
- vcpu->arch.hflags = emul_flags;
6197
-
6198
- if (changed & HF_SMM_MASK)
6199
- kvm_smm_changed(vcpu);
6200
-}
6201
-
62027411 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
62037412 unsigned long *db)
62047413 {
....@@ -6214,34 +7423,29 @@
62147423 return dr6;
62157424 }
62167425
6217
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
7426
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
62187427 {
62197428 struct kvm_run *kvm_run = vcpu->run;
62207429
62217430 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
62227431 kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
6223
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
7432
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
62247433 kvm_run->debug.arch.exception = DB_VECTOR;
62257434 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6226
- *r = EMULATE_USER_EXIT;
6227
- } else {
6228
- /*
6229
- * "Certain debug exceptions may clear bit 0-3. The
6230
- * remaining contents of the DR6 register are never
6231
- * cleared by the processor".
6232
- */
6233
- vcpu->arch.dr6 &= ~15;
6234
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
6235
- kvm_queue_exception(vcpu, DB_VECTOR);
7435
+ return 0;
62367436 }
7437
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
7438
+ return 1;
62377439 }
62387440
62397441 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
62407442 {
6241
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
6242
- int r = EMULATE_DONE;
7443
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
7444
+ int r;
62437445
6244
- kvm_x86_ops->skip_emulated_instruction(vcpu);
7446
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
7447
+ if (unlikely(!r))
7448
+ return 0;
62457449
62467450 /*
62477451 * rflags is the old, "raw" value of the flags. The new value has
....@@ -6252,12 +7456,12 @@
62527456 * that sets the TF flag".
62537457 */
62547458 if (unlikely(rflags & X86_EFLAGS_TF))
6255
- kvm_vcpu_do_singlestep(vcpu, &r);
6256
- return r == EMULATE_DONE;
7459
+ r = kvm_vcpu_do_singlestep(vcpu);
7460
+ return r;
62577461 }
62587462 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
62597463
6260
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
7464
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
62617465 {
62627466 if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
62637467 (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
....@@ -6272,7 +7476,7 @@
62727476 kvm_run->debug.arch.pc = eip;
62737477 kvm_run->debug.arch.exception = DB_VECTOR;
62747478 kvm_run->exit_reason = KVM_EXIT_DEBUG;
6275
- *r = EMULATE_USER_EXIT;
7479
+ *r = 0;
62767480 return true;
62777481 }
62787482 }
....@@ -6285,10 +7489,8 @@
62857489 vcpu->arch.db);
62867490
62877491 if (dr6 != 0) {
6288
- vcpu->arch.dr6 &= ~15;
6289
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
6290
- kvm_queue_exception(vcpu, DB_VECTOR);
6291
- *r = EMULATE_DONE;
7492
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
7493
+ *r = 1;
62927494 return true;
62937495 }
62947496 }
....@@ -6327,13 +7529,45 @@
63277529 return false;
63287530 }
63297531
7532
+/*
7533
+ * Decode an instruction for emulation. The caller is responsible for handling
7534
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
7535
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
7536
+ * code breakpoints have higher priority and thus have already been done by
7537
+ * hardware.
7538
+ *
7539
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
7540
+ * response to a machine check.
7541
+ */
7542
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
7543
+ void *insn, int insn_len)
7544
+{
7545
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7546
+ int r;
7547
+
7548
+ init_emulate_ctxt(vcpu);
7549
+
7550
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
7551
+
7552
+ r = x86_decode_insn(ctxt, insn, insn_len);
7553
+
7554
+ trace_kvm_emulate_insn_start(vcpu);
7555
+ ++vcpu->stat.insn_emulation;
7556
+
7557
+ return r;
7558
+}
7559
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
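Splitting the decode step out also makes the EMULTYPE contract easier to see: when decode fails and EMULTYPE_TRAP_UD was set, x86_emulate_instruction() below queues a #UD for the guest and returns 1. A hedged sketch of the typical caller (the handler name is illustrative, not from this patch):

static int handle_ud_example(struct kvm_vcpu *vcpu)
{
	/* decode and emulate; an unrecognised opcode becomes a guest #UD */
	return kvm_emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
}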
7560
+
63307561 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
63317562 int emulation_type, void *insn, int insn_len)
63327563 {
63337564 int r;
6334
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
7565
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
63357566 bool writeback = true;
6336
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
7567
+ bool write_fault_to_spt;
7568
+
7569
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
7570
+ return 1;
63377571
63387572 vcpu->arch.l1tf_flush_l1d = true;
63397573
....@@ -6341,40 +7575,36 @@
63417575 * Clear write_fault_to_shadow_pgtable here to ensure it is
63427576 * never reused.
63437577 */
7578
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
63447579 vcpu->arch.write_fault_to_shadow_pgtable = false;
6345
- kvm_clear_exception_queue(vcpu);
63467580
63477581 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
6348
- init_emulate_ctxt(vcpu);
7582
+ kvm_clear_exception_queue(vcpu);
63497583
63507584 /*
6351
- * We will reenter on the same instruction since
6352
- * we do not set complete_userspace_io. This does not
6353
- * handle watchpoints yet, those would be handled in
6354
- * the emulate_ops.
7585
+ * Return immediately if RIP hits a code breakpoint, such #DBs
7586
+ * are fault-like and are higher priority than any faults on
7587
+ * the code fetch itself.
63557588 */
63567589 if (!(emulation_type & EMULTYPE_SKIP) &&
6357
- kvm_vcpu_check_breakpoint(vcpu, &r))
7590
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
63587591 return r;
63597592
6360
- ctxt->interruptibility = 0;
6361
- ctxt->have_exception = false;
6362
- ctxt->exception.vector = -1;
6363
- ctxt->perm_ok = false;
6364
-
6365
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
6366
-
6367
- r = x86_decode_insn(ctxt, insn, insn_len);
6368
-
6369
- trace_kvm_emulate_insn_start(vcpu);
6370
- ++vcpu->stat.insn_emulation;
7593
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
7594
+ insn, insn_len);
63717595 if (r != EMULATION_OK) {
6372
- if (emulation_type & EMULTYPE_TRAP_UD)
6373
- return EMULATE_FAIL;
6374
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
6375
- emulation_type))
6376
- return EMULATE_DONE;
6377
- if (ctxt->have_exception) {
7596
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
7597
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
7598
+ kvm_queue_exception(vcpu, UD_VECTOR);
7599
+ return 1;
7600
+ }
7601
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
7602
+ write_fault_to_spt,
7603
+ emulation_type))
7604
+ return 1;
7605
+
7606
+ if (ctxt->have_exception &&
7607
+ !(emulation_type & EMULTYPE_SKIP)) {
63787608 /*
63797609 * #UD should result in just EMULATION_FAILED, and trap-like
63807610 * exception should not be encountered during decode.
....@@ -6382,27 +7612,32 @@
63827612 WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
63837613 exception_type(ctxt->exception.vector) == EXCPT_TRAP);
63847614 inject_emulated_exception(vcpu);
6385
- return EMULATE_DONE;
7615
+ return 1;
63867616 }
6387
- if (emulation_type & EMULTYPE_SKIP)
6388
- return EMULATE_FAIL;
63897617 return handle_emulation_failure(vcpu, emulation_type);
63907618 }
63917619 }
63927620
6393
- if ((emulation_type & EMULTYPE_VMWARE) &&
6394
- !is_vmware_backdoor_opcode(ctxt))
6395
- return EMULATE_FAIL;
7621
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
7622
+ !is_vmware_backdoor_opcode(ctxt)) {
7623
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
7624
+ return 1;
7625
+ }
63967626
7627
+ /*
7628
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
7629
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
7630
+ * updating interruptibility state and injecting single-step #DBs.
7631
+ */
63977632 if (emulation_type & EMULTYPE_SKIP) {
63987633 kvm_rip_write(vcpu, ctxt->_eip);
63997634 if (ctxt->eflags & X86_EFLAGS_RF)
64007635 kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
6401
- return EMULATE_DONE;
7636
+ return 1;
64027637 }
64037638
64047639 if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
6405
- return EMULATE_DONE;
7640
+ return 1;
64067641
64077642 /* this is needed for vmware backdoor interface to work since it
64087643    changes register values during IO operation */
....@@ -6412,24 +7647,35 @@
64127647 }
64137648
64147649 restart:
6415
- /* Save the faulting GPA (cr2) in the address field */
6416
- ctxt->exception.address = cr2_or_gpa;
7650
+ if (emulation_type & EMULTYPE_PF) {
7651
+ /* Save the faulting GPA (cr2) in the address field */
7652
+ ctxt->exception.address = cr2_or_gpa;
7653
+
7654
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
7655
+ if (vcpu->arch.mmu->direct_map) {
7656
+ ctxt->gpa_available = true;
7657
+ ctxt->gpa_val = cr2_or_gpa;
7658
+ }
7659
+ } else {
7660
+ /* Sanitize the address out of an abundance of paranoia. */
7661
+ ctxt->exception.address = 0;
7662
+ }
64177663
64187664 r = x86_emulate_insn(ctxt);
64197665
64207666 if (r == EMULATION_INTERCEPTED)
6421
- return EMULATE_DONE;
7667
+ return 1;
64227668
64237669 if (r == EMULATION_FAILED) {
64247670 if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
64257671 emulation_type))
6426
- return EMULATE_DONE;
7672
+ return 1;
64277673
64287674 return handle_emulation_failure(vcpu, emulation_type);
64297675 }
64307676
64317677 if (ctxt->have_exception) {
6432
- r = EMULATE_DONE;
7678
+ r = 1;
64337679 if (inject_emulated_exception(vcpu))
64347680 return r;
64357681 } else if (vcpu->arch.pio.count) {
....@@ -6440,26 +7686,36 @@
64407686 writeback = false;
64417687 vcpu->arch.complete_userspace_io = complete_emulated_pio;
64427688 }
6443
- r = EMULATE_USER_EXIT;
7689
+ r = 0;
64447690 } else if (vcpu->mmio_needed) {
7691
+ ++vcpu->stat.mmio_exits;
7692
+
64457693 if (!vcpu->mmio_is_write)
64467694 writeback = false;
6447
- r = EMULATE_USER_EXIT;
7695
+ r = 0;
64487696 vcpu->arch.complete_userspace_io = complete_emulated_mmio;
64497697 } else if (r == EMULATION_RESTART)
64507698 goto restart;
64517699 else
6452
- r = EMULATE_DONE;
7700
+ r = 1;
64537701
64547702 if (writeback) {
6455
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
7703
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
64567704 toggle_interruptibility(vcpu, ctxt->interruptibility);
64577705 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
7706
+
7707
+ /*
7708
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
7709
+ * only supports code breakpoints and general detect #DB, both
7710
+ * of which are fault-like.
7711
+ */
64587712 if (!ctxt->have_exception ||
64597713 exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
64607714 kvm_rip_write(vcpu, ctxt->eip);
6461
- if (r == EMULATE_DONE && ctxt->tf)
6462
- kvm_vcpu_do_singlestep(vcpu, &r);
7715
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
7716
+ r = kvm_vcpu_do_singlestep(vcpu);
7717
+ if (kvm_x86_ops.update_emulated_instruction)
7718
+ kvm_x86_ops.update_emulated_instruction(vcpu);
64637719 __kvm_set_rflags(vcpu, ctxt->eflags);
64647720 }
64657721
....@@ -6509,9 +7765,9 @@
65097765 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
65107766 unsigned short port)
65117767 {
6512
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
6513
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
6514
- size, port, &val, 1);
7768
+ unsigned long val = kvm_rax_read(vcpu);
7769
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
7770
+
65157771 if (ret)
65167772 return ret;
65177773
....@@ -6544,16 +7800,14 @@
65447800 }
65457801
65467802 /* For size less than 4 we merge, else we zero extend */
6547
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
6548
- : 0;
7803
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
65497804
65507805 /*
6551
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
7806
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
65527807 * the copy and tracing
65537808 */
6554
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
6555
- vcpu->arch.pio.port, &val, 1);
6556
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7809
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
7810
+ kvm_rax_write(vcpu, val);
65577811
65587812 return kvm_skip_emulated_instruction(vcpu);
65597813 }
....@@ -6565,12 +7819,11 @@
65657819 int ret;
65667820
65677821 /* For size less than 4 we merge, else we zero extend */
6568
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
7822
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
65697823
6570
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
6571
- &val, 1);
7824
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
65727825 if (ret) {
6573
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
7826
+ kvm_rax_write(vcpu, val);
65747827 return ret;
65757828 }
65767829
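The "merge vs. zero extend" rule above is easiest to see with concrete values. A standalone sketch of just the arithmetic (not KVM code; it assumes a little-endian host, as on x86):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint64_t rax = 0x1122334455667788ULL;
	uint16_t in2 = 0xabcd;		/* result of a 2-byte IN */
	uint32_t in4 = 0xdeadbeef;	/* result of a 4-byte IN */
	uint64_t val;

	/* size < 4: start from the old RAX so the upper bytes survive */
	val = rax;
	memcpy(&val, &in2, sizeof(in2));
	assert(val == 0x112233445566abcdULL);

	/* size == 4: start from 0 so the 32-bit result is zero-extended */
	val = 0;
	memcpy(&val, &in4, sizeof(in4));
	assert(val == 0x00000000deadbeefULL);

	return 0;
}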
....@@ -6649,10 +7902,8 @@
66497902 }
66507903 #endif
66517904
6652
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
6653
- void *data)
7905
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
66547906 {
6655
- struct cpufreq_freqs *freq = data;
66567907 struct kvm *kvm;
66577908 struct kvm_vcpu *vcpu;
66587909 int i, send_ipi = 0;
....@@ -6696,17 +7947,12 @@
66967947 *
66977948 */
66987949
6699
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
6700
- return 0;
6701
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
6702
- return 0;
6703
-
6704
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7950
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67057951
67067952 mutex_lock(&kvm_lock);
67077953 list_for_each_entry(kvm, &vm_list, vm_list) {
67087954 kvm_for_each_vcpu(i, vcpu, kvm) {
6709
- if (vcpu->cpu != freq->cpu)
7955
+ if (vcpu->cpu != cpu)
67107956 continue;
67117957 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
67127958 if (vcpu->cpu != raw_smp_processor_id())
....@@ -6728,8 +7974,24 @@
67287974 * guest context is entered kvmclock will be updated,
67297975 * so the guest will not see stale values.
67307976 */
6731
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
7977
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
67327978 }
7979
+}
7980
+
7981
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
7982
+ void *data)
7983
+{
7984
+ struct cpufreq_freqs *freq = data;
7985
+ int cpu;
7986
+
7987
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
7988
+ return 0;
7989
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
7990
+ return 0;
7991
+
7992
+ for_each_cpu(cpu, freq->policy->cpus)
7993
+ __kvmclock_cpufreq_notifier(freq, cpu);
7994
+
67337995 return 0;
67347996 }
67357997
....@@ -6749,20 +8011,21 @@
67498011
67508012 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
67518013 #ifdef CONFIG_CPU_FREQ
6752
- struct cpufreq_policy policy;
8014
+ struct cpufreq_policy *policy;
67538015 int cpu;
67548016
6755
- memset(&policy, 0, sizeof(policy));
67568017 cpu = get_cpu();
6757
- cpufreq_get_policy(&policy, cpu);
6758
- if (policy.cpuinfo.max_freq)
6759
- max_tsc_khz = policy.cpuinfo.max_freq;
8018
+ policy = cpufreq_cpu_get(cpu);
8019
+ if (policy) {
8020
+ if (policy->cpuinfo.max_freq)
8021
+ max_tsc_khz = policy->cpuinfo.max_freq;
8022
+ cpufreq_cpu_put(policy);
8023
+ }
67608024 put_cpu();
67618025 #endif
67628026 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
67638027 CPUFREQ_TRANSITION_NOTIFIER);
67648028 }
6765
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
67668029
67678030 cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
67688031 kvmclock_cpu_online, kvmclock_cpu_down_prep);
....@@ -6781,7 +8044,7 @@
67818044 int user_mode = 3;
67828045
67838046 if (__this_cpu_read(current_vcpu))
6784
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
8047
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
67858048
67868049 return user_mode != 0;
67878050 }
....@@ -6796,10 +8059,20 @@
67968059 return ip;
67978060 }
67988061
8062
+static void kvm_handle_intel_pt_intr(void)
8063
+{
8064
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
8065
+
8066
+ kvm_make_request(KVM_REQ_PMI, vcpu);
8067
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8068
+ (unsigned long *)&vcpu->arch.pmu.global_status);
8069
+}
8070
+
67998071 static struct perf_guest_info_callbacks kvm_guest_cbs = {
68008072 .is_in_guest = kvm_is_in_guest,
68018073 .is_user_mode = kvm_is_user_mode,
68028074 .get_guest_ip = kvm_get_guest_ip,
8075
+ .handle_intel_pt_intr = NULL,
68038076 };
68048077
68058078 #ifdef CONFIG_X86_64
....@@ -6821,6 +8094,18 @@
68218094 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
68228095
68238096 /*
8097
+ * Indirection to move queue_work() out of the tk_core.seq write held
8098
+ * region to prevent possible deadlocks against time accessors which
8099
+ * are invoked with work related locks held.
8100
+ */
8101
+static void pvclock_irq_work_fn(struct irq_work *w)
8102
+{
8103
+ queue_work(system_long_wq, &pvclock_gtod_work);
8104
+}
8105
+
8106
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
8107
+
8108
+/*
68248109 * Notification about pvclock gtod data update.
68258110 */
68268111 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
....@@ -6831,13 +8116,14 @@
68318116
68328117 update_pvclock_gtod(tk);
68338118
6834
- /* disable master clock if host does not trust, or does not
6835
- * use, TSC based clocksource.
8119
+ /*
8120
+ * Disable master clock if host does not trust, or does not use,
8121
+ * TSC based clocksource. Delegate queue_work() to irq_work as
8122
+ * this is invoked with tk_core.seq write held.
68368123 */
68378124 if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
68388125 atomic_read(&kvm_guest_has_master_clock) != 0)
6839
- queue_work(system_long_wq, &pvclock_gtod_work);
6840
-
8126
+ irq_work_queue(&pvclock_irq_work);
68418127 return 0;
68428128 }
68438129
....@@ -6848,50 +8134,79 @@
68488134
68498135 int kvm_arch_init(void *opaque)
68508136 {
8137
+ struct kvm_x86_init_ops *ops = opaque;
68518138 int r;
6852
- struct kvm_x86_ops *ops = opaque;
68538139
6854
- if (kvm_x86_ops) {
8140
+ if (kvm_x86_ops.hardware_enable) {
68558141 printk(KERN_ERR "kvm: already loaded the other module\n");
68568142 r = -EEXIST;
68578143 goto out;
68588144 }
68598145
68608146 if (!ops->cpu_has_kvm_support()) {
6861
- printk(KERN_ERR "kvm: no hardware support\n");
8147
+ pr_err_ratelimited("kvm: no hardware support\n");
68628148 r = -EOPNOTSUPP;
68638149 goto out;
68648150 }
68658151 if (ops->disabled_by_bios()) {
6866
- printk(KERN_ERR "kvm: disabled by bios\n");
8152
+ pr_err_ratelimited("kvm: disabled by bios\n");
8153
+ r = -EOPNOTSUPP;
8154
+ goto out;
8155
+ }
8156
+
8157
+ /*
8158
+ * KVM explicitly assumes that the guest has an FPU and
8159
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
8160
+	 * vCPU's FPU state to a fxregs_state struct.
8161
+ */
8162
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
8163
+ printk(KERN_ERR "kvm: inadequate fpu\n");
68678164 r = -EOPNOTSUPP;
68688165 goto out;
68698166 }
68708167
68718168 r = -ENOMEM;
6872
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
6873
- if (!shared_msrs) {
6874
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
8169
+ x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
8170
+ __alignof__(struct fpu), SLAB_ACCOUNT,
8171
+ NULL);
8172
+ if (!x86_fpu_cache) {
8173
+ printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
68758174 goto out;
68768175 }
68778176
6878
- r = kvm_mmu_module_init();
8177
+ x86_emulator_cache = kvm_alloc_emulator_cache();
8178
+ if (!x86_emulator_cache) {
8179
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
8180
+ goto out_free_x86_fpu_cache;
8181
+ }
8182
+
8183
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
8184
+ if (!user_return_msrs) {
8185
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
8186
+ goto out_free_x86_emulator_cache;
8187
+ }
8188
+
8189
+ r = kvm_mmu_vendor_module_init();
68798190 if (r)
68808191 goto out_free_percpu;
6881
-
6882
- kvm_x86_ops = ops;
68838192
68848193 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
68858194 PT_DIRTY_MASK, PT64_NX_MASK, 0,
68868195 PT_PRESENT_MASK, 0, sme_me_mask);
68878196 kvm_timer_init();
68888197
8198
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
8199
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
68898200 perf_register_guest_info_callbacks(&kvm_guest_cbs);
68908201
6891
- if (boot_cpu_has(X86_FEATURE_XSAVE))
8202
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
68928203 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
8204
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
8205
+ }
68938206
68948207 kvm_lapic_init();
8208
+ if (pi_inject_timer == -1)
8209
+ pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
68958210 #ifdef CONFIG_X86_64
68968211 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
68978212
....@@ -6902,7 +8217,11 @@
69028217 return 0;
69038218
69048219 out_free_percpu:
6905
- free_percpu(shared_msrs);
8220
+ free_percpu(user_return_msrs);
8221
+out_free_x86_emulator_cache:
8222
+ kmem_cache_destroy(x86_emulator_cache);
8223
+out_free_x86_fpu_cache:
8224
+ kmem_cache_destroy(x86_fpu_cache);
69068225 out:
69078226 return r;
69088227 }
....@@ -6915,6 +8234,7 @@
69158234 #endif
69168235 kvm_lapic_exit();
69178236 perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
8237
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
69188238
69198239 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
69208240 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
....@@ -6922,11 +8242,14 @@
69228242 cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
69238243 #ifdef CONFIG_X86_64
69248244 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
8245
+ irq_work_sync(&pvclock_irq_work);
69258246 cancel_work_sync(&pvclock_gtod_work);
69268247 #endif
6927
- kvm_x86_ops = NULL;
6928
- kvm_mmu_module_exit();
6929
- free_percpu(shared_msrs);
8248
+ kvm_x86_ops.hardware_enable = NULL;
8249
+ kvm_mmu_vendor_module_exit();
8250
+ free_percpu(user_return_msrs);
8251
+ kmem_cache_destroy(x86_emulator_cache);
8252
+ kmem_cache_destroy(x86_fpu_cache);
69308253 }
69318254
69328255 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
....@@ -6990,22 +8313,52 @@
69908313 */
69918314 static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
69928315 {
6993
- struct kvm_lapic_irq lapic_irq;
8316
+ /*
8317
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
8318
+ * common code, e.g. for tracing. Defer initialization to the compiler.
8319
+ */
8320
+ struct kvm_lapic_irq lapic_irq = {
8321
+ .delivery_mode = APIC_DM_REMRD,
8322
+ .dest_mode = APIC_DEST_PHYSICAL,
8323
+ .shorthand = APIC_DEST_NOSHORT,
8324
+ .dest_id = apicid,
8325
+ };
69948326
6995
- lapic_irq.shorthand = 0;
6996
- lapic_irq.dest_mode = 0;
6997
- lapic_irq.level = 0;
6998
- lapic_irq.dest_id = apicid;
6999
- lapic_irq.msi_redir_hint = false;
7000
-
7001
- lapic_irq.delivery_mode = APIC_DM_REMRD;
70028327 kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
70038328 }
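"Defer initialization to the compiler" works because a designated initializer zero-initializes every member that is not named explicitly. A tiny standalone illustration (the struct is a stand-in, not the real kvm_lapic_irq):

#include <assert.h>

struct toy_irq {
	int delivery_mode;
	int dest_mode;
	int dest_id;
	int level;		/* not named below */
	int msi_redir_hint;	/* not named below */
};

int main(void)
{
	struct toy_irq irq = {
		.delivery_mode = 3,
		.dest_id = 7,
	};

	/* C99 6.7.8: members without a designator are initialized to zero */
	assert(irq.level == 0 && irq.msi_redir_hint == 0);
	return 0;
}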
70048329
7005
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
8330
+bool kvm_apicv_activated(struct kvm *kvm)
70068331 {
7007
- vcpu->arch.apicv_active = false;
7008
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
8332
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
8333
+}
8334
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
8335
+
8336
+void kvm_apicv_init(struct kvm *kvm, bool enable)
8337
+{
8338
+ if (enable)
8339
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
8340
+ &kvm->arch.apicv_inhibit_reasons);
8341
+ else
8342
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
8343
+ &kvm->arch.apicv_inhibit_reasons);
8344
+}
8345
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
8346
+
8347
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
8348
+{
8349
+ struct kvm_vcpu *target = NULL;
8350
+ struct kvm_apic_map *map;
8351
+
8352
+ rcu_read_lock();
8353
+ map = rcu_dereference(kvm->arch.apic_map);
8354
+
8355
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
8356
+ target = map->phys_map[dest_id]->vcpu;
8357
+
8358
+ rcu_read_unlock();
8359
+
8360
+ if (target && READ_ONCE(target->ready))
8361
+ kvm_vcpu_yield_to(target);
70098362 }
70108363
70118364 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
....@@ -7016,11 +8369,11 @@
70168369 if (kvm_hv_hypercall_enabled(vcpu->kvm))
70178370 return kvm_hv_hypercall(vcpu);
70188371
7019
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
7020
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
7021
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
7022
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
7023
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
8372
+ nr = kvm_rax_read(vcpu);
8373
+ a0 = kvm_rbx_read(vcpu);
8374
+ a1 = kvm_rcx_read(vcpu);
8375
+ a2 = kvm_rdx_read(vcpu);
8376
+ a3 = kvm_rsi_read(vcpu);
70248377
70258378 trace_kvm_hypercall(nr, a0, a1, a2, a3);
70268379
....@@ -7033,17 +8386,23 @@
70338386 a3 &= 0xFFFFFFFF;
70348387 }
70358388
7036
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
8389
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
70378390 ret = -KVM_EPERM;
70388391 goto out;
70398392 }
8393
+
8394
+ ret = -KVM_ENOSYS;
70408395
70418396 switch (nr) {
70428397 case KVM_HC_VAPIC_POLL_IRQ:
70438398 ret = 0;
70448399 break;
70458400 case KVM_HC_KICK_CPU:
8401
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
8402
+ break;
8403
+
70468404 kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
8405
+ kvm_sched_yield(vcpu->kvm, a1);
70478406 ret = 0;
70488407 break;
70498408 #ifdef CONFIG_X86_64
....@@ -7052,7 +8411,17 @@
70528411 break;
70538412 #endif
70548413 case KVM_HC_SEND_IPI:
8414
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
8415
+ break;
8416
+
70558417 ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
8418
+ break;
8419
+ case KVM_HC_SCHED_YIELD:
8420
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
8421
+ break;
8422
+
8423
+ kvm_sched_yield(vcpu->kvm, a0);
8424
+ ret = 0;
70568425 break;
70578426 default:
70588427 ret = -KVM_ENOSYS;
....@@ -7061,7 +8430,7 @@
70618430 out:
70628431 if (!op_64_bit)
70638432 ret = (u32)ret;
7064
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
8433
+ kvm_rax_write(vcpu, ret);
70658434
70668435 ++vcpu->stat.hypercalls;
70678436 return kvm_skip_emulated_instruction(vcpu);
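The register convention visible above (hypercall number in RAX, arguments in RBX/RCX/RDX/RSI, return value written back to RAX) is also the guest-side contract. A hedged guest-side sketch using the Intel "vmcall" encoding; AMD guests use "vmmcall", and the in-kernel kvm_para.h helpers pick the right one, so this is purely illustrative:

static inline long kvm_hypercall2_example(unsigned int nr,
					  unsigned long p1, unsigned long p2)
{
	long ret;

	asm volatile("vmcall"
		     : "=a"(ret)
		     : "a"(nr), "b"(p1), "c"(p2)
		     : "memory");
	return ret;
}

A guest would kick a halted vCPU with kvm_hypercall2_example(KVM_HC_KICK_CPU, 0, apic_id), matching the a0/a1 usage above.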
....@@ -7074,7 +8443,7 @@
70748443 char instruction[3];
70758444 unsigned long rip = kvm_rip_read(vcpu);
70768445
7077
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
8446
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
70788447
70798448 return emulator_write_emulated(ctxt, rip, instruction, 3,
70808449 &ctxt->exception);
....@@ -7103,7 +8472,7 @@
71038472 {
71048473 int max_irr, tpr;
71058474
7106
- if (!kvm_x86_ops->update_cr8_intercept)
8475
+ if (!kvm_x86_ops.update_cr8_intercept)
71078476 return;
71088477
71098478 if (!lapic_in_kernel(vcpu))
....@@ -7122,24 +8491,32 @@
71228491
71238492 tpr = kvm_lapic_get_cr8(vcpu);
71248493
7125
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
8494
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
71268495 }
71278496
71288497 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
71298498 {
7130
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
7131
- vcpu->arch.exception.error_code = false;
7132
- kvm_x86_ops->queue_exception(vcpu);
8499
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
8500
+ vcpu->arch.exception.has_error_code,
8501
+ vcpu->arch.exception.error_code,
8502
+ vcpu->arch.exception.injected);
8503
+
8504
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
8505
+ vcpu->arch.exception.error_code = false;
8506
+ kvm_x86_ops.queue_exception(vcpu);
71338507 }
71348508
7135
-static int inject_pending_event(struct kvm_vcpu *vcpu)
8509
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
71368510 {
71378511 int r;
8512
+ bool can_inject = true;
71388513
71398514 /* try to reinject previous events if any */
71408515
7141
- if (vcpu->arch.exception.injected)
8516
+ if (vcpu->arch.exception.injected) {
71428517 kvm_inject_exception(vcpu);
8518
+ can_inject = false;
8519
+ }
71438520 /*
71448521 * Do not inject an NMI or interrupt if there is a pending
71458522 * exception. Exceptions and interrupts are recognized at
....@@ -7155,11 +8532,17 @@
71558532 * fully complete the previous instruction.
71568533 */
71578534 else if (!vcpu->arch.exception.pending) {
7158
- if (vcpu->arch.nmi_injected)
7159
- kvm_x86_ops->set_nmi(vcpu);
7160
- else if (vcpu->arch.interrupt.injected)
7161
- kvm_x86_ops->set_irq(vcpu);
8535
+ if (vcpu->arch.nmi_injected) {
8536
+ kvm_x86_ops.set_nmi(vcpu);
8537
+ can_inject = false;
8538
+ } else if (vcpu->arch.interrupt.injected) {
8539
+ kvm_x86_ops.set_irq(vcpu);
8540
+ can_inject = false;
8541
+ }
71628542 }
8543
+
8544
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
8545
+ vcpu->arch.exception.pending);
71638546
71648547 /*
71658548 * Call check_nested_events() even if we reinjected a previous event
....@@ -7167,69 +8550,107 @@
71678550 * from L2 to L1 due to pending L1 events which require exit
71688551 * from L2 to L1.
71698552 */
7170
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7171
- r = kvm_x86_ops->check_nested_events(vcpu);
7172
- if (r != 0)
7173
- return r;
8553
+ if (is_guest_mode(vcpu)) {
8554
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
8555
+ if (r < 0)
8556
+ goto busy;
71748557 }
71758558
71768559 /* try to inject new event if pending */
71778560 if (vcpu->arch.exception.pending) {
7178
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
7179
- vcpu->arch.exception.has_error_code,
7180
- vcpu->arch.exception.error_code);
7181
-
7182
- WARN_ON_ONCE(vcpu->arch.exception.injected);
7183
- vcpu->arch.exception.pending = false;
7184
- vcpu->arch.exception.injected = true;
7185
-
8561
+ /*
8562
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
8563
+		 * value pushed on the stack.  Trap-like exceptions and all #DBs
8564
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
8565
+		 * AMD states that code breakpoint #DBs explicitly clear RF=0).
8566
+ *
8567
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
8568
+ * describe the behavior of General Detect #DBs, which are
8569
+ * fault-like. They do _not_ set RF, a la code breakpoints.
8570
+ */
71868571 if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
71878572 __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
71888573 X86_EFLAGS_RF);
71898574
7190
- if (vcpu->arch.exception.nr == DB_VECTOR &&
7191
- (vcpu->arch.dr7 & DR7_GD)) {
7192
- vcpu->arch.dr7 &= ~DR7_GD;
7193
- kvm_update_dr7(vcpu);
8575
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
8576
+ kvm_deliver_exception_payload(vcpu);
8577
+ if (vcpu->arch.dr7 & DR7_GD) {
8578
+ vcpu->arch.dr7 &= ~DR7_GD;
8579
+ kvm_update_dr7(vcpu);
8580
+ }
71948581 }
71958582
71968583 kvm_inject_exception(vcpu);
8584
+
8585
+ vcpu->arch.exception.pending = false;
8586
+ vcpu->arch.exception.injected = true;
8587
+
8588
+ can_inject = false;
71978589 }
71988590
7199
- /* Don't consider new event if we re-injected an event */
7200
- if (kvm_event_needs_reinjection(vcpu))
7201
- return 0;
7202
-
7203
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
7204
- kvm_x86_ops->smi_allowed(vcpu)) {
7205
- vcpu->arch.smi_pending = false;
7206
- ++vcpu->arch.smi_count;
7207
- enter_smm(vcpu);
7208
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
7209
- --vcpu->arch.nmi_pending;
7210
- vcpu->arch.nmi_injected = true;
7211
- kvm_x86_ops->set_nmi(vcpu);
7212
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
7213
- /*
7214
- * Because interrupts can be injected asynchronously, we are
7215
- * calling check_nested_events again here to avoid a race condition.
7216
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
7217
- * proposal and current concerns. Perhaps we should be setting
7218
- * KVM_REQ_EVENT only on certain events and not unconditionally?
7219
- */
7220
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
7221
- r = kvm_x86_ops->check_nested_events(vcpu);
7222
- if (r != 0)
7223
- return r;
7224
- }
7225
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
7226
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
7227
- false);
7228
- kvm_x86_ops->set_irq(vcpu);
7229
- }
8591
+ /*
8592
+ * Finally, inject interrupt events. If an event cannot be injected
8593
+ * due to architectural conditions (e.g. IF=0) a window-open exit
8594
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
8595
+ * and can architecturally be injected, but we cannot do it right now:
8596
+ * an interrupt could have arrived just now and we have to inject it
8597
+	 * as a vmexit, or there could already be an event in the queue, which is
8598
+ * indicated by can_inject. In that case we request an immediate exit
8599
+ * in order to make progress and get back here for another iteration.
8600
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
8601
+ */
8602
+ if (vcpu->arch.smi_pending) {
8603
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
8604
+ if (r < 0)
8605
+ goto busy;
8606
+ if (r) {
8607
+ vcpu->arch.smi_pending = false;
8608
+ ++vcpu->arch.smi_count;
8609
+ enter_smm(vcpu);
8610
+ can_inject = false;
8611
+ } else
8612
+ kvm_x86_ops.enable_smi_window(vcpu);
72308613 }
72318614
7232
- return 0;
8615
+ if (vcpu->arch.nmi_pending) {
8616
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
8617
+ if (r < 0)
8618
+ goto busy;
8619
+ if (r) {
8620
+ --vcpu->arch.nmi_pending;
8621
+ vcpu->arch.nmi_injected = true;
8622
+ kvm_x86_ops.set_nmi(vcpu);
8623
+ can_inject = false;
8624
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
8625
+ }
8626
+ if (vcpu->arch.nmi_pending)
8627
+ kvm_x86_ops.enable_nmi_window(vcpu);
8628
+ }
8629
+
8630
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
8631
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
8632
+ if (r < 0)
8633
+ goto busy;
8634
+ if (r) {
8635
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
8636
+ kvm_x86_ops.set_irq(vcpu);
8637
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
8638
+ }
8639
+ if (kvm_cpu_has_injectable_intr(vcpu))
8640
+ kvm_x86_ops.enable_irq_window(vcpu);
8641
+ }
8642
+
8643
+ if (is_guest_mode(vcpu) &&
8644
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
8645
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
8646
+ *req_immediate_exit = true;
8647
+
8648
+ WARN_ON(vcpu->arch.exception.pending);
8649
+ return;
8650
+
8651
+busy:
8652
+ *req_immediate_exit = true;
8653
+ return;
72338654 }
72348655
72358656 static void process_nmi(struct kvm_vcpu *vcpu)
....@@ -7241,7 +8662,7 @@
72418662 * If an NMI is already in progress, limit further NMIs to just one.
72428663 * Otherwise, allow two (and we'll inject the first one immediately).
72438664 */
7244
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
8665
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
72458666 limit = 1;
72468667
72478668 vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
....@@ -7331,11 +8752,11 @@
73318752 put_smstate(u32, buf, 0x7f7c, seg.limit);
73328753 put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
73338754
7334
- kvm_x86_ops->get_gdt(vcpu, &dt);
8755
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73358756 put_smstate(u32, buf, 0x7f74, dt.address);
73368757 put_smstate(u32, buf, 0x7f70, dt.size);
73378758
7338
- kvm_x86_ops->get_idt(vcpu, &dt);
8759
+ kvm_x86_ops.get_idt(vcpu, &dt);
73398760 put_smstate(u32, buf, 0x7f58, dt.address);
73408761 put_smstate(u32, buf, 0x7f54, dt.size);
73418762
....@@ -7385,7 +8806,7 @@
73858806 put_smstate(u32, buf, 0x7e94, seg.limit);
73868807 put_smstate(u64, buf, 0x7e98, seg.base);
73878808
7388
- kvm_x86_ops->get_idt(vcpu, &dt);
8809
+ kvm_x86_ops.get_idt(vcpu, &dt);
73898810 put_smstate(u32, buf, 0x7e84, dt.size);
73908811 put_smstate(u64, buf, 0x7e88, dt.address);
73918812
....@@ -7395,7 +8816,7 @@
73958816 put_smstate(u32, buf, 0x7e74, seg.limit);
73968817 put_smstate(u64, buf, 0x7e78, seg.base);
73978818
7398
- kvm_x86_ops->get_gdt(vcpu, &dt);
8819
+ kvm_x86_ops.get_gdt(vcpu, &dt);
73998820 put_smstate(u32, buf, 0x7e64, dt.size);
74008821 put_smstate(u64, buf, 0x7e68, dt.address);
74018822
....@@ -7425,28 +8846,28 @@
74258846 * vCPU state (e.g. leave guest mode) after we've saved the state into
74268847 * the SMM state-save area.
74278848 */
7428
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
8849
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
74298850
74308851 vcpu->arch.hflags |= HF_SMM_MASK;
74318852 kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
74328853
7433
- if (kvm_x86_ops->get_nmi_mask(vcpu))
8854
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
74348855 vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
74358856 else
7436
- kvm_x86_ops->set_nmi_mask(vcpu, true);
8857
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
74378858
74388859 kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
74398860 kvm_rip_write(vcpu, 0x8000);
74408861
74418862 cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
7442
- kvm_x86_ops->set_cr0(vcpu, cr0);
8863
+ kvm_x86_ops.set_cr0(vcpu, cr0);
74438864 vcpu->arch.cr0 = cr0;
74448865
7445
- kvm_x86_ops->set_cr4(vcpu, 0);
8866
+ kvm_x86_ops.set_cr4(vcpu, 0);
74468867
74478868 /* Undocumented: IDT limit is set to zero on entry to SMM. */
74488869 dt.address = dt.size = 0;
7449
- kvm_x86_ops->set_idt(vcpu, &dt);
8870
+ kvm_x86_ops.set_idt(vcpu, &dt);
74508871
74518872 __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
74528873
....@@ -7477,10 +8898,10 @@
74778898
74788899 #ifdef CONFIG_X86_64
74798900 if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
7480
- kvm_x86_ops->set_efer(vcpu, 0);
8901
+ kvm_x86_ops.set_efer(vcpu, 0);
74818902 #endif
74828903
7483
- kvm_update_cpuid(vcpu);
8904
+ kvm_update_cpuid_runtime(vcpu);
74848905 kvm_mmu_reset_context(vcpu);
74858906 }
74868907
....@@ -7490,10 +8911,82 @@
74908911 kvm_make_request(KVM_REQ_EVENT, vcpu);
74918912 }
74928913
8914
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
8915
+ unsigned long *vcpu_bitmap)
8916
+{
8917
+ cpumask_var_t cpus;
8918
+
8919
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
8920
+
8921
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
8922
+ NULL, vcpu_bitmap, cpus);
8923
+
8924
+ free_cpumask_var(cpus);
8925
+}
8926
+
74938927 void kvm_make_scan_ioapic_request(struct kvm *kvm)
74948928 {
74958929 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
74968930 }
8931
+
8932
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
8933
+{
8934
+ if (!lapic_in_kernel(vcpu))
8935
+ return;
8936
+
8937
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
8938
+ kvm_apic_update_apicv(vcpu);
8939
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
8940
+}
8941
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
8942
+
8943
+/*
8944
+ * NOTE: Do not hold any lock prior to calling this.
8945
+ *
8946
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
8947
+ * locked, because it calls __x86_set_memory_region() which does
8948
+ * synchronize_srcu(&kvm->srcu).
8949
+ */
8950
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
8951
+{
8952
+ struct kvm_vcpu *except;
8953
+ unsigned long old, new, expected;
8954
+
8955
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
8956
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
8957
+ return;
8958
+
8959
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
8960
+ do {
8961
+ expected = new = old;
8962
+ if (activate)
8963
+ __clear_bit(bit, &new);
8964
+ else
8965
+ __set_bit(bit, &new);
8966
+ if (new == old)
8967
+ break;
8968
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
8969
+ } while (old != expected);
8970
+
8971
+ if (!!old == !!new)
8972
+ return;
8973
+
8974
+ trace_kvm_apicv_update_request(activate, bit);
8975
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
8976
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
8977
+
8978
+ /*
8979
+	 * Send a request to update APICv for all other vCPUs, while
8980
+	 * updating the calling vCPU immediately instead of waiting
8981
+	 * for another #VMEXIT to handle the request.
8982
+ */
8983
+ except = kvm_get_running_vcpu();
8984
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
8985
+ except);
8986
+ if (except)
8987
+ kvm_vcpu_update_apicv(except);
8988
+}
8989
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
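The update loop above is a standard lock-free read-modify-write: recompute the new mask from the latest observed value, retry the cmpxchg until no other writer intervenes, and bail out early if the bit is already in the requested state. A standalone C11 analogue of just that pattern (toy code, not the kernel's cmpxchg()):

#include <stdatomic.h>
#include <stdbool.h>

/* Returns true when the overall "any inhibit bit set" state flipped. */
static bool toy_update_reason(_Atomic unsigned long *reasons,
			      unsigned int bit, bool activate)
{
	unsigned long old = atomic_load(reasons);
	unsigned long new;

	do {
		new = activate ? old & ~(1UL << bit) : old | (1UL << bit);
		if (new == old)
			break;	/* nothing to do, another caller won */
	} while (!atomic_compare_exchange_weak(reasons, &old, new));

	return !!old != !!new;
}

As in the function above, comparing the truth values of the old and new masks tells the caller whether the overall activation state actually changed.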
74978990
74988991 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
74998992 {
....@@ -7506,7 +8999,7 @@
75068999 kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
75079000 else {
75089001 if (vcpu->arch.apicv_active)
7509
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9002
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
75109003 if (ioapic_in_kernel(vcpu->kvm))
75119004 kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
75129005 }
....@@ -7526,7 +9019,7 @@
75269019
75279020 bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
75289021 vcpu_to_synic(vcpu)->vec_bitmap, 256);
7529
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
9022
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
75309023 }
75319024
75329025 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
....@@ -7543,28 +9036,22 @@
75439036 kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
75449037 }
75459038
9039
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
9040
+{
9041
+ if (kvm_x86_ops.guest_memory_reclaimed)
9042
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
9043
+}
9044
+
75469045 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
75479046 {
7548
- struct page *page = NULL;
7549
-
75509047 if (!lapic_in_kernel(vcpu))
75519048 return;
75529049
7553
- if (!kvm_x86_ops->set_apic_access_page_addr)
9050
+ if (!kvm_x86_ops.set_apic_access_page_addr)
75549051 return;
75559052
7556
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
7557
- if (is_error_page(page))
7558
- return;
7559
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
7560
-
7561
- /*
7562
- * Do not pin apic access page in memory, the MMU notifier
7563
- * will call us again if it is migrated or swapped out.
7564
- */
7565
- put_page(page);
9053
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
75669054 }
7567
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
75689055
75699056 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
75709057 {
....@@ -7583,12 +9070,17 @@
75839070 bool req_int_win =
75849071 dm_request_for_irq_injection(vcpu) &&
75859072 kvm_cpu_accept_dm_intr(vcpu);
9073
+ fastpath_t exit_fastpath;
75869074
75879075 bool req_immediate_exit = false;
75889076
75899077 if (kvm_request_pending(vcpu)) {
7590
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
7591
- kvm_x86_ops->get_vmcs12_pages(vcpu);
9078
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
9079
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
9080
+ r = 0;
9081
+ goto out;
9082
+ }
9083
+ }
75929084 if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
75939085 kvm_mmu_unload(vcpu);
75949086 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
....@@ -7604,10 +9096,19 @@
76049096 }
76059097 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
76069098 kvm_mmu_sync_roots(vcpu);
7607
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
7608
- kvm_mmu_load_cr3(vcpu);
7609
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
7610
- kvm_vcpu_flush_tlb(vcpu, true);
9099
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
9100
+ kvm_mmu_load_pgd(vcpu);
9101
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
9102
+ kvm_vcpu_flush_tlb_all(vcpu);
9103
+
9104
+ /* Flushing all ASIDs flushes the current ASID... */
9105
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
9106
+ }
9107
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
9108
+ kvm_vcpu_flush_tlb_current(vcpu);
9109
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
9110
+ kvm_vcpu_flush_tlb_guest(vcpu);
9111
+
76119112 if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
76129113 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
76139114 r = 0;
....@@ -7678,6 +9179,12 @@
76789179 */
76799180 if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
76809181 kvm_hv_process_stimers(vcpu);
9182
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
9183
+ kvm_vcpu_update_apicv(vcpu);
9184
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
9185
+ kvm_check_async_pf_completion(vcpu);
9186
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
9187
+ kvm_x86_ops.msr_filter_changed(vcpu);
76819188 }
76829189
76839190 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
....@@ -7688,32 +9195,9 @@
76889195 goto out;
76899196 }
76909197
7691
- if (inject_pending_event(vcpu) != 0)
7692
- req_immediate_exit = true;
7693
- else {
7694
- /* Enable SMI/NMI/IRQ window open exits if needed.
7695
- *
7696
- * SMIs have three cases:
7697
- * 1) They can be nested, and then there is nothing to
7698
- * do here because RSM will cause a vmexit anyway.
7699
- * 2) There is an ISA-specific reason why SMI cannot be
7700
- * injected, and the moment when this changes can be
7701
- * intercepted.
7702
- * 3) Or the SMI can be pending because
7703
- * inject_pending_event has completed the injection
7704
- * of an IRQ or NMI from the previous vmexit, and
7705
- * then we request an immediate exit to inject the
7706
- * SMI.
7707
- */
7708
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
7709
- if (!kvm_x86_ops->enable_smi_window(vcpu))
7710
- req_immediate_exit = true;
7711
- if (vcpu->arch.nmi_pending)
7712
- kvm_x86_ops->enable_nmi_window(vcpu);
7713
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
7714
- kvm_x86_ops->enable_irq_window(vcpu);
7715
- WARN_ON(vcpu->arch.exception.pending);
7716
- }
9198
+ inject_pending_event(vcpu, &req_immediate_exit);
9199
+ if (req_int_win)
9200
+ kvm_x86_ops.enable_irq_window(vcpu);
77179201
77189202 if (kvm_lapic_enabled(vcpu)) {
77199203 update_cr8_intercept(vcpu);
....@@ -7728,7 +9212,7 @@
77289212
77299213 preempt_disable();
77309214
7731
- kvm_x86_ops->prepare_guest_switch(vcpu);
9215
+ kvm_x86_ops.prepare_guest_switch(vcpu);
77329216
77339217 /*
77349218 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
....@@ -7744,7 +9228,7 @@
77449228 * 1) We should set ->mode before checking ->requests. Please see
77459229 * the comment in kvm_vcpu_exiting_guest_mode().
77469230 *
7747
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
9231
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
77489232 * pairs with the memory barrier implicit in pi_test_and_set_on
77499233 * (see vmx_deliver_posted_interrupt).
77509234 *
....@@ -7759,10 +9243,9 @@
77599243 * notified with kvm_vcpu_kick.
77609244 */
77619245 if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
7762
- kvm_x86_ops->sync_pir_to_irr(vcpu);
9246
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
77639247
7764
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
7765
- || need_resched() || signal_pending(current)) {
9248
+ if (kvm_vcpu_exit_request(vcpu)) {
77669249 vcpu->mode = OUTSIDE_GUEST_MODE;
77679250 smp_wmb();
77689251 local_irq_enable();
....@@ -7774,13 +9257,14 @@
77749257
77759258 if (req_immediate_exit) {
77769259 kvm_make_request(KVM_REQ_EVENT, vcpu);
7777
- kvm_x86_ops->request_immediate_exit(vcpu);
9260
+ kvm_x86_ops.request_immediate_exit(vcpu);
77789261 }
77799262
7780
- trace_kvm_entry(vcpu->vcpu_id);
7781
- if (lapic_timer_advance_ns)
7782
- wait_lapic_expire(vcpu);
7783
- guest_enter_irqoff();
9263
+ trace_kvm_entry(vcpu);
9264
+
9265
+ fpregs_assert_state_consistent();
9266
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9267
+ switch_fpu_return();
77849268
77859269 if (unlikely(vcpu->arch.switch_db_regs)) {
77869270 set_debugreg(0, 7);
....@@ -7794,7 +9278,7 @@
77949278 set_debugreg(0, 7);
77959279 }
77969280
7797
- kvm_x86_ops->run(vcpu);
9281
+ exit_fastpath = kvm_x86_ops.run(vcpu);
77989282
77999283 /*
78009284 * Do this here before restoring debug registers on the host. And
....@@ -7804,9 +9288,8 @@
78049288 */
78059289 if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
78069290 WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
7807
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
9291
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
78089292 kvm_update_dr0123(vcpu);
7809
- kvm_update_dr6(vcpu);
78109293 kvm_update_dr7(vcpu);
78119294 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
78129295 }
....@@ -7821,18 +9304,43 @@
78219304 if (hw_breakpoint_active())
78229305 hw_breakpoint_restore();
78239306
9307
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
78249308 vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
78259309
78269310 vcpu->mode = OUTSIDE_GUEST_MODE;
78279311 smp_wmb();
78289312
9313
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
9314
+
9315
+ /*
9316
+ * Consume any pending interrupts, including the possible source of
9317
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
9318
+ * An instruction is required after local_irq_enable() to fully unblock
9319
+ * interrupts on processors that implement an interrupt shadow, the
9320
+ * stat.exits increment will do nicely.
9321
+ */
78299322 kvm_before_interrupt(vcpu);
7830
- kvm_x86_ops->handle_external_intr(vcpu);
9323
+ local_irq_enable();
9324
+ ++vcpu->stat.exits;
9325
+ local_irq_disable();
78319326 kvm_after_interrupt(vcpu);
78329327
7833
- ++vcpu->stat.exits;
9328
+ /*
9329
+ * Wait until after servicing IRQs to account guest time so that any
9330
+ * ticks that occurred while running the guest are properly accounted
9331
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
9332
+ * of accounting via context tracking, but the loss of accuracy is
9333
+ * acceptable for all known use cases.
9334
+ */
9335
+ vtime_account_guest_exit();
78349336
7835
- guest_exit_irqoff();
9337
+ if (lapic_in_kernel(vcpu)) {
9338
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
9339
+ if (delta != S64_MIN) {
9340
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
9341
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
9342
+ }
9343
+ }
78369344
78379345 local_irq_enable();
78389346 preempt_enable();
....@@ -7853,12 +9361,13 @@
78539361 if (vcpu->arch.apic_attention)
78549362 kvm_lapic_sync_from_vapic(vcpu);
78559363
7856
- vcpu->arch.gpa_available = false;
7857
- r = kvm_x86_ops->handle_exit(vcpu);
9364
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
78589365 return r;
78599366
78609367 cancel_injection:
7861
- kvm_x86_ops->cancel_injection(vcpu);
9368
+ if (req_immediate_exit)
9369
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
9370
+ kvm_x86_ops.cancel_injection(vcpu);
78629371 if (unlikely(vcpu->arch.apic_attention))
78639372 kvm_lapic_sync_from_vapic(vcpu);
78649373 out:
....@@ -7868,13 +9377,13 @@
78689377 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
78699378 {
78709379 if (!kvm_arch_vcpu_runnable(vcpu) &&
7871
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
9380
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
78729381 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
78739382 kvm_vcpu_block(vcpu);
78749383 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
78759384
7876
- if (kvm_x86_ops->post_block)
7877
- kvm_x86_ops->post_block(vcpu);
9385
+ if (kvm_x86_ops.post_block)
9386
+ kvm_x86_ops.post_block(vcpu);
78789387
78799388 if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
78809389 return 1;
....@@ -7886,6 +9395,7 @@
78869395 vcpu->arch.pv.pv_unhalted = false;
78879396 vcpu->arch.mp_state =
78889397 KVM_MP_STATE_RUNNABLE;
9398
+ fallthrough;
78899399 case KVM_MP_STATE_RUNNABLE:
78909400 vcpu->arch.apf.halted = false;
78919401 break;
....@@ -7893,15 +9403,14 @@
78939403 break;
78949404 default:
78959405 return -EINTR;
7896
- break;
78979406 }
78989407 return 1;
78999408 }
79009409
79019410 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
79029411 {
7903
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7904
- kvm_x86_ops->check_nested_events(vcpu);
9412
+ if (is_guest_mode(vcpu))
9413
+ kvm_x86_ops.nested_ops->check_events(vcpu);
79059414
79069415 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
79079416 !vcpu->arch.apf.halted);
....@@ -7916,6 +9425,13 @@
79169425 vcpu->arch.l1tf_flush_l1d = true;
79179426
79189427 for (;;) {
9428
+ /*
9429
+ * If another guest vCPU requests a PV TLB flush in the middle
9430
+ * of instruction emulation, the rest of the emulation could
9431
+ * use a stale page translation. Assume that any code after
9432
+ * this point can start executing an instruction.
9433
+ */
9434
+ vcpu->arch.at_instruction_boundary = false;
79199435 if (kvm_vcpu_running(vcpu)) {
79209436 r = vcpu_enter_guest(vcpu);
79219437 } else {
....@@ -7937,17 +9453,11 @@
79379453 break;
79389454 }
79399455
7940
- kvm_check_async_pf_completion(vcpu);
7941
-
7942
- if (signal_pending(current)) {
7943
- r = -EINTR;
7944
- vcpu->run->exit_reason = KVM_EXIT_INTR;
7945
- ++vcpu->stat.signal_exits;
7946
- break;
7947
- }
7948
- if (need_resched()) {
9456
+ if (__xfer_to_guest_mode_work_pending()) {
79499457 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
7950
- cond_resched();
9458
+ r = xfer_to_guest_mode_handle_work(vcpu);
9459
+ if (r)
9460
+ return r;
79519461 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
79529462 }
79539463 }
....@@ -7960,12 +9470,11 @@
79609470 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
79619471 {
79629472 int r;
9473
+
79639474 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
79649475 r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
79659476 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
7966
- if (r != EMULATE_DONE)
7967
- return 0;
7968
- return 1;
9477
+ return r;
79699478 }
79709479
79719480 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
....@@ -8038,31 +9547,55 @@
80389547 return 0;
80399548 }
80409549
9550
+static void kvm_save_current_fpu(struct fpu *fpu)
9551
+{
9552
+ /*
9553
+ * If the target FPU state is not resident in the CPU registers, just
9554
+ * memcpy() from current, else save CPU state directly to the target.
9555
+ */
9556
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
9557
+ memcpy(&fpu->state, &current->thread.fpu.state,
9558
+ fpu_kernel_xstate_size);
9559
+ else
9560
+ copy_fpregs_to_fpstate(fpu);
9561
+}
9562
+
80419563 /* Swap (qemu) user FPU context for the guest FPU context. */
80429564 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
80439565 {
8044
- preempt_disable();
8045
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
8046
- /* PKRU is separately restored in kvm_x86_ops->run. */
8047
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
9566
+ fpregs_lock();
9567
+
9568
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
9569
+
9570
+ /* PKRU is separately restored in kvm_x86_ops.run. */
9571
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
80489572 ~XFEATURE_MASK_PKRU);
8049
- preempt_enable();
9573
+
9574
+ fpregs_mark_activate();
9575
+ fpregs_unlock();
9576
+
80509577 trace_kvm_fpu(1);
80519578 }
80529579
80539580 /* When vcpu_run ends, restore user space FPU context. */
80549581 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
80559582 {
8056
- preempt_disable();
8057
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
8058
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
8059
- preempt_enable();
9583
+ fpregs_lock();
9584
+
9585
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
9586
+
9587
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
9588
+
9589
+ fpregs_mark_activate();
9590
+ fpregs_unlock();
9591
+
80609592 ++vcpu->stat.fpu_reload;
80619593 trace_kvm_fpu(0);
80629594 }
80639595
8064
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
9596
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
80659597 {
9598
+ struct kvm_run *kvm_run = vcpu->run;
80669599 int r;
80679600
80689601 vcpu_load(vcpu);
....@@ -8080,18 +9613,18 @@
80809613 r = -EAGAIN;
80819614 if (signal_pending(current)) {
80829615 r = -EINTR;
8083
- vcpu->run->exit_reason = KVM_EXIT_INTR;
9616
+ kvm_run->exit_reason = KVM_EXIT_INTR;
80849617 ++vcpu->stat.signal_exits;
80859618 }
80869619 goto out;
80879620 }
80889621
8089
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
9622
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
80909623 r = -EINVAL;
80919624 goto out;
80929625 }
80939626
8094
- if (vcpu->run->kvm_dirty_regs) {
9627
+ if (kvm_run->kvm_dirty_regs) {
80959628 r = sync_regs(vcpu);
80969629 if (r != 0)
80979630 goto out;
....@@ -8121,7 +9654,7 @@
81219654
81229655 out:
81239656 kvm_put_guest_fpu(vcpu);
8124
- if (vcpu->run->kvm_valid_regs)
9657
+ if (kvm_run->kvm_valid_regs)
81259658 store_regs(vcpu);
81269659 post_kvm_run_save(vcpu);
81279660 kvm_sigset_deactivate(vcpu);
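kvm_arch_vcpu_ioctl_run() is the kernel end of userspace's KVM_RUN loop; the kvm_valid_regs/kvm_dirty_regs fields checked above live in the mmap()ed struct kvm_run. A minimal userspace sketch of that loop (error handling trimmed; vcpu_fd and run are assumed to come from KVM_CREATE_VCPU and an mmap of the vCPU fd):

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int run_loop_example(int vcpu_fd, struct kvm_run *run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			return -1;		/* -EINTR surfaces here too */

		switch (run->exit_reason) {
		case KVM_EXIT_IO:
		case KVM_EXIT_MMIO:
			/* emulate the device access, then loop */
			break;
		case KVM_EXIT_HLT:
			return 0;
		default:
			return -1;
		}
	}
}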
....@@ -8140,26 +9673,26 @@
81409673 	 * that usually, but some badly designed PV devices (vmware
81419674 * backdoor interface) need this to work
81429675 */
8143
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
9676
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
81449677 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81459678 }
8146
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
8147
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
8148
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
8149
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
8150
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
8151
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
8152
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
8153
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
9679
+ regs->rax = kvm_rax_read(vcpu);
9680
+ regs->rbx = kvm_rbx_read(vcpu);
9681
+ regs->rcx = kvm_rcx_read(vcpu);
9682
+ regs->rdx = kvm_rdx_read(vcpu);
9683
+ regs->rsi = kvm_rsi_read(vcpu);
9684
+ regs->rdi = kvm_rdi_read(vcpu);
9685
+ regs->rsp = kvm_rsp_read(vcpu);
9686
+ regs->rbp = kvm_rbp_read(vcpu);
81549687 #ifdef CONFIG_X86_64
8155
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
8156
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
8157
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
8158
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
8159
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
8160
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
8161
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
8162
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
9688
+ regs->r8 = kvm_r8_read(vcpu);
9689
+ regs->r9 = kvm_r9_read(vcpu);
9690
+ regs->r10 = kvm_r10_read(vcpu);
9691
+ regs->r11 = kvm_r11_read(vcpu);
9692
+ regs->r12 = kvm_r12_read(vcpu);
9693
+ regs->r13 = kvm_r13_read(vcpu);
9694
+ regs->r14 = kvm_r14_read(vcpu);
9695
+ regs->r15 = kvm_r15_read(vcpu);
81639696 #endif
81649697
81659698 regs->rip = kvm_rip_read(vcpu);
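The kvm_rax_read()/kvm_rax_write() style accessors used throughout this patch are thin wrappers over the generic register cache. One plausible way such wrappers can be generated, shown only as a sketch (the real definitions live in kvm_cache_regs.h and may differ in detail):

#define BUILD_KVM_GPR_ACCESSORS(lname, uname)				\
static inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)	\
{									\
	return kvm_register_read(vcpu, VCPU_REGS_##uname);		\
}									\
static inline void kvm_##lname##_write(struct kvm_vcpu *vcpu,		\
				       unsigned long val)		\
{									\
	kvm_register_write(vcpu, VCPU_REGS_##uname, val);		\
}
BUILD_KVM_GPR_ACCESSORS(rax, RAX)
BUILD_KVM_GPR_ACCESSORS(rbx, RBX)

Either way, the per-register helpers keep call sites short and funnel every access through the lazy register cache.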
....@@ -8179,23 +9712,23 @@
81799712 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
81809713 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
81819714
8182
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
8183
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
8184
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
8185
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
8186
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
8187
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
8188
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
8189
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
9715
+ kvm_rax_write(vcpu, regs->rax);
9716
+ kvm_rbx_write(vcpu, regs->rbx);
9717
+ kvm_rcx_write(vcpu, regs->rcx);
9718
+ kvm_rdx_write(vcpu, regs->rdx);
9719
+ kvm_rsi_write(vcpu, regs->rsi);
9720
+ kvm_rdi_write(vcpu, regs->rdi);
9721
+ kvm_rsp_write(vcpu, regs->rsp);
9722
+ kvm_rbp_write(vcpu, regs->rbp);
81909723 #ifdef CONFIG_X86_64
8191
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
8192
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
8193
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
8194
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
8195
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
8196
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
8197
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
8198
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
9724
+ kvm_r8_write(vcpu, regs->r8);
9725
+ kvm_r9_write(vcpu, regs->r9);
9726
+ kvm_r10_write(vcpu, regs->r10);
9727
+ kvm_r11_write(vcpu, regs->r11);
9728
+ kvm_r12_write(vcpu, regs->r12);
9729
+ kvm_r13_write(vcpu, regs->r13);
9730
+ kvm_r14_write(vcpu, regs->r14);
9731
+ kvm_r15_write(vcpu, regs->r15);
81999732 #endif
82009733
82019734 kvm_rip_write(vcpu, regs->rip);
....@@ -8238,10 +9771,10 @@
82389771 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
82399772 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
82409773
8241
- kvm_x86_ops->get_idt(vcpu, &dt);
9774
+ kvm_x86_ops.get_idt(vcpu, &dt);
82429775 sregs->idt.limit = dt.size;
82439776 sregs->idt.base = dt.address;
8244
- kvm_x86_ops->get_gdt(vcpu, &dt);
9777
+ kvm_x86_ops.get_gdt(vcpu, &dt);
82459778 sregs->gdt.limit = dt.size;
82469779 sregs->gdt.base = dt.address;
82479780
....@@ -8253,7 +9786,7 @@
82539786 sregs->efer = vcpu->arch.efer;
82549787 sregs->apic_base = kvm_get_apic_base(vcpu);
82559788
8256
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
9789
+ memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
82579790
82589791 if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
82599792 set_bit(vcpu->arch.interrupt.nr,
....@@ -8300,8 +9833,12 @@
83009833 mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
83019834 goto out;
83029835
8303
- /* INITs are latched while in SMM */
8304
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
9836
+ /*
9837
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
9838
+ * INIT state; latched init should be reported using
9839
+ * KVM_SET_VCPU_EVENTS, so reject it here.
9840
+ */
9841
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
83059842 (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
83069843 mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
83079844 goto out;
....@@ -8322,21 +9859,23 @@
83229859 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
83239860 int reason, bool has_error_code, u32 error_code)
83249861 {
8325
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
9862
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
83269863 int ret;
83279864
83289865 init_emulate_ctxt(vcpu);
83299866
83309867 ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
83319868 has_error_code, error_code);
8332
-
8333
- if (ret)
8334
- return EMULATE_FAIL;
9869
+ if (ret) {
9870
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
9871
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
9872
+ vcpu->run->internal.ndata = 0;
9873
+ return 0;
9874
+ }
83359875
83369876 kvm_rip_write(vcpu, ctxt->eip);
83379877 kvm_set_rflags(vcpu, ctxt->eflags);
8338
- kvm_make_request(KVM_REQ_EVENT, vcpu);
8339
- return EMULATE_DONE;
9878
+ return 1;
83409879 }
83419880 EXPORT_SYMBOL_GPL(kvm_task_switch);
83429881
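/*
 * Editor's note -- illustrative sketch, not part of this patch. It only
 * demonstrates the new return convention of kvm_task_switch(): 1 means
 * "resume the guest", 0 means "exit to userspace" with vcpu->run already
 * filled in (KVM_EXIT_INTERNAL_ERROR on emulation failure). The function
 * and its pre-decoded arguments are hypothetical stand-ins for a vendor
 * task-switch exit handler, not existing code.
 */
static int example_handle_task_switch(struct kvm_vcpu *vcpu,
				      u16 tss_selector, int reason,
				      bool has_error_code, u32 error_code)
{
	/* Propagate the result unchanged: 1 = continue, 0 = userspace exit. */
	return kvm_task_switch(vcpu, tss_selector, -1, reason,
			       has_error_code, error_code);
}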
....@@ -8350,6 +9889,8 @@
83509889 */
83519890 if (!(sregs->cr4 & X86_CR4_PAE)
83529891 || !(sregs->efer & EFER_LMA))
9892
+ return -EINVAL;
9893
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
83539894 return -EINVAL;
83549895 } else {
83559896 /*
....@@ -8382,31 +9923,31 @@
83829923
83839924 dt.size = sregs->idt.limit;
83849925 dt.address = sregs->idt.base;
8385
- kvm_x86_ops->set_idt(vcpu, &dt);
9926
+ kvm_x86_ops.set_idt(vcpu, &dt);
83869927 dt.size = sregs->gdt.limit;
83879928 dt.address = sregs->gdt.base;
8388
- kvm_x86_ops->set_gdt(vcpu, &dt);
9929
+ kvm_x86_ops.set_gdt(vcpu, &dt);
83899930
83909931 vcpu->arch.cr2 = sregs->cr2;
83919932 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
83929933 vcpu->arch.cr3 = sregs->cr3;
8393
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
9934
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
83949935
83959936 kvm_set_cr8(vcpu, sregs->cr8);
83969937
83979938 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
8398
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
9939
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
83999940
84009941 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
8401
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
9942
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
84029943 vcpu->arch.cr0 = sregs->cr0;
84039944
84049945 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
84059946 cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
84069947 (X86_CR4_OSXSAVE | X86_CR4_PKE));
8407
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
9948
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
84089949 if (cpuid_update_needed)
8409
- kvm_update_cpuid(vcpu);
9950
+ kvm_update_cpuid_runtime(vcpu);
84109951
84119952 idx = srcu_read_lock(&vcpu->kvm->srcu);
84129953 if (is_pae_paging(vcpu)) {
....@@ -8510,7 +10051,7 @@
851010051 */
851110052 kvm_set_rflags(vcpu, rflags);
851210053
8513
- kvm_x86_ops->update_bp_intercept(vcpu);
10054
+ kvm_x86_ops.update_exception_bitmap(vcpu);
851410055
851510056 r = 0;
851610057
....@@ -8549,7 +10090,7 @@
854910090
855010091 vcpu_load(vcpu);
855110092
8552
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10093
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
855310094 memcpy(fpu->fpr, fxsave->st_space, 128);
855410095 fpu->fcw = fxsave->cwd;
855510096 fpu->fsw = fxsave->swd;
....@@ -8557,7 +10098,7 @@
855710098 fpu->last_opcode = fxsave->fop;
855810099 fpu->last_ip = fxsave->rip;
855910100 fpu->last_dp = fxsave->rdp;
8560
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
10101
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
856110102
856210103 vcpu_put(vcpu);
856310104 return 0;
....@@ -8569,7 +10110,7 @@
856910110
857010111 vcpu_load(vcpu);
857110112
8572
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
10113
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
857310114
857410115 memcpy(fxsave->st_space, fpu->fpr, 128);
857510116 fxsave->cwd = fpu->fcw;
....@@ -8578,7 +10119,7 @@
857810119 fxsave->fop = fpu->last_opcode;
857910120 fxsave->rip = fpu->last_ip;
858010121 fxsave->rdp = fpu->last_dp;
8581
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
10122
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
858210123
858310124 vcpu_put(vcpu);
858410125 return 0;
....@@ -8625,9 +10166,9 @@
862510166
862610167 static void fx_init(struct kvm_vcpu *vcpu)
862710168 {
8628
- fpstate_init(&vcpu->arch.guest_fpu.state);
10169
+ fpstate_init(&vcpu->arch.guest_fpu->state);
862910170 if (boot_cpu_has(X86_FEATURE_XSAVES))
8630
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
10171
+ vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
863110172 host_xcr0 | XSTATE_COMPACTION_ENABLED;
863210173
863310174 /*
....@@ -8638,48 +10179,122 @@
863810179 vcpu->arch.cr0 |= X86_CR0_ET;
863910180 }
864010181
8641
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
10182
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
864210183 {
8643
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
8644
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
8645
-
8646
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
8647
-
8648
- kvmclock_reset(vcpu);
8649
-
8650
- kvm_x86_ops->vcpu_free(vcpu);
8651
- free_cpumask_var(wbinvd_dirty_mask);
8652
-}
8653
-
8654
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
8655
- unsigned int id)
8656
-{
8657
- struct kvm_vcpu *vcpu;
8658
-
865910184 if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
8660
- printk_once(KERN_WARNING
8661
- "kvm: SMP vm created on host with unstable TSC; "
8662
- "guest TSC will not be reliable\n");
10185
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
10186
+ "guest TSC will not be reliable\n");
866310187
8664
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
8665
-
8666
- return vcpu;
10188
+ return 0;
866710189 }
866810190
8669
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
10191
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
867010192 {
10193
+ struct page *page;
10194
+ int r;
10195
+
10196
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
10197
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10198
+ else
10199
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
10200
+
10201
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
10202
+
10203
+ r = kvm_mmu_create(vcpu);
10204
+ if (r < 0)
10205
+ return r;
10206
+
10207
+ if (irqchip_in_kernel(vcpu->kvm)) {
10208
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
10209
+ if (r < 0)
10210
+ goto fail_mmu_destroy;
10211
+ if (kvm_apicv_activated(vcpu->kvm))
10212
+ vcpu->arch.apicv_active = true;
10213
+ } else
10214
+ static_key_slow_inc(&kvm_no_apic_vcpu);
10215
+
10216
+ r = -ENOMEM;
10217
+
10218
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
10219
+ if (!page)
10220
+ goto fail_free_lapic;
10221
+ vcpu->arch.pio_data = page_address(page);
10222
+
10223
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
10224
+ GFP_KERNEL_ACCOUNT);
10225
+ if (!vcpu->arch.mce_banks)
10226
+ goto fail_free_pio_data;
10227
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
10228
+
10229
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
10230
+ GFP_KERNEL_ACCOUNT))
10231
+ goto fail_free_mce_banks;
10232
+
10233
+ if (!alloc_emulate_ctxt(vcpu))
10234
+ goto free_wbinvd_dirty_mask;
10235
+
10236
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
10237
+ GFP_KERNEL_ACCOUNT);
10238
+ if (!vcpu->arch.user_fpu) {
10239
+ pr_err("kvm: failed to allocate userspace's fpu\n");
10240
+ goto free_emulate_ctxt;
10241
+ }
10242
+
10243
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
10244
+ GFP_KERNEL_ACCOUNT);
10245
+ if (!vcpu->arch.guest_fpu) {
10246
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
10247
+ goto free_user_fpu;
10248
+ }
10249
+ fx_init(vcpu);
10250
+
10251
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
10252
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
10253
+
10254
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
10255
+
10256
+ kvm_async_pf_hash_reset(vcpu);
10257
+ kvm_pmu_init(vcpu);
10258
+
10259
+ vcpu->arch.pending_external_vector = -1;
10260
+ vcpu->arch.preempted_in_kernel = false;
10261
+
10262
+ kvm_hv_vcpu_init(vcpu);
10263
+
10264
+ r = kvm_x86_ops.vcpu_create(vcpu);
10265
+ if (r)
10266
+ goto free_guest_fpu;
10267
+
867110268 vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
10269
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
867210270 kvm_vcpu_mtrr_init(vcpu);
867310271 vcpu_load(vcpu);
867410272 kvm_vcpu_reset(vcpu, false);
8675
- kvm_mmu_setup(vcpu);
10273
+ kvm_init_mmu(vcpu, false);
867610274 vcpu_put(vcpu);
867710275 return 0;
10276
+
10277
+free_guest_fpu:
10278
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10279
+free_user_fpu:
10280
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10281
+free_emulate_ctxt:
10282
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10283
+free_wbinvd_dirty_mask:
10284
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10285
+fail_free_mce_banks:
10286
+ kfree(vcpu->arch.mce_banks);
10287
+fail_free_pio_data:
10288
+ free_page((unsigned long)vcpu->arch.pio_data);
10289
+fail_free_lapic:
10290
+ kvm_free_lapic(vcpu);
10291
+fail_mmu_destroy:
10292
+ kvm_mmu_destroy(vcpu);
10293
+ return r;
867810294 }
867910295
868010296 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
868110297 {
8682
- struct msr_data msr;
868310298 struct kvm *kvm = vcpu->kvm;
868410299
868510300 kvm_hv_vcpu_postcreate(vcpu);
....@@ -8687,23 +10302,43 @@
868710302 if (mutex_lock_killable(&vcpu->mutex))
868810303 return;
868910304 vcpu_load(vcpu);
8690
- msr.data = 0x0;
8691
- msr.index = MSR_IA32_TSC;
8692
- msr.host_initiated = true;
8693
- kvm_write_tsc(vcpu, &msr);
10305
+ kvm_synchronize_tsc(vcpu, 0);
869410306 vcpu_put(vcpu);
10307
+
10308
+ /* poll control enabled by default */
10309
+ vcpu->arch.msr_kvm_poll_control = 1;
10310
+
869510311 mutex_unlock(&vcpu->mutex);
869610312
8697
- if (!kvmclock_periodic_sync)
8698
- return;
8699
-
8700
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
8701
- KVMCLOCK_SYNC_PERIOD);
10313
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
10314
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
10315
+ KVMCLOCK_SYNC_PERIOD);
870210316 }
870310317
870410318 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
870510319 {
8706
- kvm_arch_vcpu_free(vcpu);
10320
+ int idx;
10321
+
10322
+ kvmclock_reset(vcpu);
10323
+
10324
+ kvm_x86_ops.vcpu_free(vcpu);
10325
+
10326
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
10327
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
10328
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
10329
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
10330
+
10331
+ kvm_hv_vcpu_uninit(vcpu);
10332
+ kvm_pmu_destroy(vcpu);
10333
+ kfree(vcpu->arch.mce_banks);
10334
+ kvm_free_lapic(vcpu);
10335
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
10336
+ kvm_mmu_destroy(vcpu);
10337
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
10338
+ free_page((unsigned long)vcpu->arch.pio_data);
10339
+ kvfree(vcpu->arch.cpuid_entries);
10340
+ if (!lapic_in_kernel(vcpu))
10341
+ static_key_slow_dec(&kvm_no_apic_vcpu);
870710342 }
870810343
870910344 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
....@@ -8719,19 +10354,18 @@
871910354 vcpu->arch.nmi_injected = false;
872010355 kvm_clear_interrupt_queue(vcpu);
872110356 kvm_clear_exception_queue(vcpu);
8722
- vcpu->arch.exception.pending = false;
872310357
872410358 memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
872510359 kvm_update_dr0123(vcpu);
872610360 vcpu->arch.dr6 = DR6_INIT;
8727
- kvm_update_dr6(vcpu);
872810361 vcpu->arch.dr7 = DR7_FIXED_1;
872910362 kvm_update_dr7(vcpu);
873010363
873110364 vcpu->arch.cr2 = 0;
873210365
873310366 kvm_make_request(KVM_REQ_EVENT, vcpu);
8734
- vcpu->arch.apf.msr_val = 0;
10367
+ vcpu->arch.apf.msr_en_val = 0;
10368
+ vcpu->arch.apf.msr_int_val = 0;
873510369 vcpu->arch.st.msr_val = 0;
873610370
873710371 kvmclock_reset(vcpu);
....@@ -8749,12 +10383,12 @@
874910383 */
875010384 if (init_event)
875110385 kvm_put_guest_fpu(vcpu);
8752
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8753
- XFEATURE_MASK_BNDREGS);
10386
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10387
+ XFEATURE_BNDREGS);
875410388 if (mpx_state_buffer)
875510389 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
8756
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
8757
- XFEATURE_MASK_BNDCSR);
10390
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
10391
+ XFEATURE_BNDCSR);
875810392 if (mpx_state_buffer)
875910393 memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
876010394 if (init_event)
....@@ -8765,7 +10399,6 @@
876510399 kvm_pmu_reset(vcpu);
876610400 vcpu->arch.smbase = 0x30000;
876710401
8768
- vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
876910402 vcpu->arch.msr_misc_features_enables = 0;
877010403
877110404 vcpu->arch.xcr0 = XFEATURE_MASK_FP;
....@@ -8777,7 +10410,7 @@
877710410
877810411 vcpu->arch.ia32_xss = 0;
877910412
8780
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
10413
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
878110414 }
878210415
878310416 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
....@@ -8801,8 +10434,8 @@
880110434 u64 max_tsc = 0;
880210435 bool stable, backwards_tsc = false;
880310436
8804
- kvm_shared_msr_cpu_online();
8805
- ret = kvm_x86_ops->hardware_enable();
10437
+ kvm_user_return_msr_cpu_online();
10438
+ ret = kvm_x86_ops.hardware_enable();
880610439 if (ret != 0)
880710440 return ret;
880810441
....@@ -8828,7 +10461,7 @@
882810461 * before any KVM threads can be running. Unfortunately, we can't
882910462 * bring the TSCs fully up to date with real time, as we aren't yet far
883010463 * enough into CPU bringup that we know how much real time has actually
8831
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
10464
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
883210465 * variables that haven't been updated yet.
883310466 *
883410467 * So we simply find the maximum observed TSC above, then record the
....@@ -8884,19 +10517,32 @@
888410517
888510518 void kvm_arch_hardware_disable(void)
888610519 {
8887
- kvm_x86_ops->hardware_disable();
10520
+ kvm_x86_ops.hardware_disable();
888810521 drop_user_return_notifiers();
888910522 }
889010523
8891
-int kvm_arch_hardware_setup(void)
10524
+int kvm_arch_hardware_setup(void *opaque)
889210525 {
10526
+ struct kvm_x86_init_ops *ops = opaque;
889310527 int r;
889410528
8895
- r = kvm_x86_ops->hardware_setup();
10529
+ rdmsrl_safe(MSR_EFER, &host_efer);
10530
+
10531
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
10532
+ rdmsrl(MSR_IA32_XSS, host_xss);
10533
+
10534
+ r = ops->hardware_setup();
889610535 if (r != 0)
889710536 return r;
889810537
8899
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
10538
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
10539
+
10540
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
10541
+ supported_xss = 0;
10542
+
10543
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
10544
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
10545
+#undef __kvm_cpu_cap_has
890010546
890110547 if (kvm_has_tsc_control) {
890210548 /*
....@@ -8918,12 +10564,21 @@
891810564
891910565 void kvm_arch_hardware_unsetup(void)
892010566 {
8921
- kvm_x86_ops->hardware_unsetup();
10567
+ kvm_x86_ops.hardware_unsetup();
892210568 }
892310569
8924
-void kvm_arch_check_processor_compat(void *rtn)
10570
+int kvm_arch_check_processor_compat(void *opaque)
892510571 {
8926
- kvm_x86_ops->check_processor_compatibility(rtn);
10572
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
10573
+ struct kvm_x86_init_ops *ops = opaque;
10574
+
10575
+ WARN_ON(!irqs_disabled());
10576
+
10577
+ if (__cr4_reserved_bits(cpu_has, c) !=
10578
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
10579
+ return -EIO;
10580
+
10581
+ return ops->check_processor_compatibility();
892710582 }
892810583
892910584 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
....@@ -8940,107 +10595,35 @@
894010595 struct static_key kvm_no_apic_vcpu __read_mostly;
894110596 EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
894210597
8943
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
8944
-{
8945
- struct page *page;
8946
- int r;
8947
-
8948
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
8949
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
8950
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
8951
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
8952
- else
8953
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
8954
-
8955
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
8956
- if (!page) {
8957
- r = -ENOMEM;
8958
- goto fail;
8959
- }
8960
- vcpu->arch.pio_data = page_address(page);
8961
-
8962
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
8963
-
8964
- r = kvm_mmu_create(vcpu);
8965
- if (r < 0)
8966
- goto fail_free_pio_data;
8967
-
8968
- if (irqchip_in_kernel(vcpu->kvm)) {
8969
- r = kvm_create_lapic(vcpu);
8970
- if (r < 0)
8971
- goto fail_mmu_destroy;
8972
- } else
8973
- static_key_slow_inc(&kvm_no_apic_vcpu);
8974
-
8975
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
8976
- GFP_KERNEL);
8977
- if (!vcpu->arch.mce_banks) {
8978
- r = -ENOMEM;
8979
- goto fail_free_lapic;
8980
- }
8981
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
8982
-
8983
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
8984
- r = -ENOMEM;
8985
- goto fail_free_mce_banks;
8986
- }
8987
-
8988
- fx_init(vcpu);
8989
-
8990
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
8991
-
8992
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
8993
-
8994
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
8995
-
8996
- kvm_async_pf_hash_reset(vcpu);
8997
- kvm_pmu_init(vcpu);
8998
-
8999
- vcpu->arch.pending_external_vector = -1;
9000
- vcpu->arch.preempted_in_kernel = false;
9001
-
9002
- kvm_hv_vcpu_init(vcpu);
9003
-
9004
- return 0;
9005
-
9006
-fail_free_mce_banks:
9007
- kfree(vcpu->arch.mce_banks);
9008
-fail_free_lapic:
9009
- kvm_free_lapic(vcpu);
9010
-fail_mmu_destroy:
9011
- kvm_mmu_destroy(vcpu);
9012
-fail_free_pio_data:
9013
- free_page((unsigned long)vcpu->arch.pio_data);
9014
-fail:
9015
- return r;
9016
-}
9017
-
9018
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
9019
-{
9020
- int idx;
9021
-
9022
- kvm_hv_vcpu_uninit(vcpu);
9023
- kvm_pmu_destroy(vcpu);
9024
- kfree(vcpu->arch.mce_banks);
9025
- kvm_free_lapic(vcpu);
9026
- idx = srcu_read_lock(&vcpu->kvm->srcu);
9027
- kvm_mmu_destroy(vcpu);
9028
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
9029
- free_page((unsigned long)vcpu->arch.pio_data);
9030
- if (!lapic_in_kernel(vcpu))
9031
- static_key_slow_dec(&kvm_no_apic_vcpu);
9032
-}
9033
-
903410598 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
903510599 {
10600
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
10601
+
903610602 vcpu->arch.l1tf_flush_l1d = true;
9037
- kvm_x86_ops->sched_in(vcpu, cpu);
10603
+ if (pmu->version && unlikely(pmu->event_count)) {
10604
+ pmu->need_cleanup = true;
10605
+ kvm_make_request(KVM_REQ_PMU, vcpu);
10606
+ }
10607
+ kvm_x86_ops.sched_in(vcpu, cpu);
903810608 }
10609
+
10610
+void kvm_arch_free_vm(struct kvm *kvm)
10611
+{
10612
+ kfree(kvm->arch.hyperv.hv_pa_pg);
10613
+ vfree(kvm);
10614
+}
10615
+
903910616
904010617 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
904110618 {
10619
+ int ret;
10620
+
904210621 if (type)
904310622 return -EINVAL;
10623
+
10624
+ ret = kvm_page_track_init(kvm);
10625
+ if (ret)
10626
+ return ret;
904410627
904510628 INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
904610629 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
....@@ -9059,7 +10642,7 @@
905910642 mutex_init(&kvm->arch.apic_map_lock);
906010643 spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
906110644
9062
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
10645
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
906310646 pvclock_update_vm_gtod_copy(kvm);
906410647
906510648 kvm->arch.guest_can_read_msr_platform_info = true;
....@@ -9068,13 +10651,9 @@
906810651 INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
906910652
907010653 kvm_hv_init_vm(kvm);
9071
- kvm_page_track_init(kvm);
907210654 kvm_mmu_init_vm(kvm);
907310655
9074
- if (kvm_x86_ops->vm_init)
9075
- return kvm_x86_ops->vm_init(kvm);
9076
-
9077
- return 0;
10656
+ return kvm_x86_ops.vm_init(kvm);
907810657 }
907910658
908010659 int kvm_arch_post_init_vm(struct kvm *kvm)
....@@ -9102,7 +10681,7 @@
910210681 kvm_unload_vcpu_mmu(vcpu);
910310682 }
910410683 kvm_for_each_vcpu(i, vcpu, kvm)
9105
- kvm_arch_vcpu_free(vcpu);
10684
+ kvm_vcpu_destroy(vcpu);
910610685
910710686 mutex_lock(&kvm->lock);
910810687 for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
....@@ -9122,9 +10701,9 @@
912210701 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
912310702 {
912410703 int i, r;
9125
- unsigned long hva;
10704
+ unsigned long hva, old_npages;
912610705 struct kvm_memslots *slots = kvm_memslots(kvm);
9127
- struct kvm_memory_slot *slot, old;
10706
+ struct kvm_memory_slot *slot;
912810707
912910708 /* Called with kvm->slots_lock held. */
913010709 if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
....@@ -9132,7 +10711,7 @@
913210711
913310712 slot = id_to_memslot(slots, id);
913410713 if (size) {
9135
- if (slot->npages)
10714
+ if (slot && slot->npages)
913610715 return -EEXIST;
913710716
913810717 /*
....@@ -9144,13 +10723,13 @@
914410723 if (IS_ERR((void *)hva))
914510724 return PTR_ERR((void *)hva);
914610725 } else {
9147
- if (!slot->npages)
10726
+ if (!slot || !slot->npages)
914810727 return 0;
914910728
10729
+ old_npages = slot->npages;
915010730 hva = 0;
915110731 }
915210732
9153
- old = *slot;
915410733 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
915510734 struct kvm_userspace_memory_region m;
915610735
....@@ -9165,23 +10744,11 @@
916510744 }
916610745
916710746 if (!size)
9168
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
10747
+ vm_munmap(hva, old_npages * PAGE_SIZE);
916910748
917010749 return 0;
917110750 }
917210751 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
9173
-
9174
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
9175
-{
9176
- int r;
9177
-
9178
- mutex_lock(&kvm->slots_lock);
9179
- r = __x86_set_memory_region(kvm, id, gpa, size);
9180
- mutex_unlock(&kvm->slots_lock);
9181
-
9182
- return r;
9183
-}
9184
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
918510752
918610753 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
918710754 {
....@@ -9196,46 +10763,47 @@
919610763 	 * unless the memory map has changed due to process exit
919710764 * or fd copying.
919810765 */
9199
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
9200
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
9201
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10766
+ mutex_lock(&kvm->slots_lock);
10767
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
10768
+ 0, 0);
10769
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
10770
+ 0, 0);
10771
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
10772
+ mutex_unlock(&kvm->slots_lock);
920210773 }
9203
- if (kvm_x86_ops->vm_destroy)
9204
- kvm_x86_ops->vm_destroy(kvm);
10774
+ if (kvm_x86_ops.vm_destroy)
10775
+ kvm_x86_ops.vm_destroy(kvm);
10776
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
920510777 kvm_pic_destroy(kvm);
920610778 kvm_ioapic_destroy(kvm);
920710779 kvm_free_vcpus(kvm);
920810780 kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
10781
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
920910782 kvm_mmu_uninit_vm(kvm);
921010783 kvm_page_track_cleanup(kvm);
921110784 kvm_hv_destroy_vm(kvm);
921210785 }
921310786
9214
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
9215
- struct kvm_memory_slot *dont)
10787
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
921610788 {
921710789 int i;
921810790
921910791 for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
9220
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
9221
- kvfree(free->arch.rmap[i]);
9222
- free->arch.rmap[i] = NULL;
9223
- }
10792
+ kvfree(slot->arch.rmap[i]);
10793
+ slot->arch.rmap[i] = NULL;
10794
+
922410795 if (i == 0)
922510796 continue;
922610797
9227
- if (!dont || free->arch.lpage_info[i - 1] !=
9228
- dont->arch.lpage_info[i - 1]) {
9229
- kvfree(free->arch.lpage_info[i - 1]);
9230
- free->arch.lpage_info[i - 1] = NULL;
9231
- }
10798
+ kvfree(slot->arch.lpage_info[i - 1]);
10799
+ slot->arch.lpage_info[i - 1] = NULL;
923210800 }
923310801
9234
- kvm_page_track_free_memslot(free, dont);
10802
+ kvm_page_track_free_memslot(slot);
923510803 }
923610804
9237
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
9238
- unsigned long npages)
10805
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
10806
+ unsigned long npages)
923910807 {
924010808 int i;
924110809
....@@ -9257,13 +10825,13 @@
925710825
925810826 slot->arch.rmap[i] =
925910827 kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
9260
- GFP_KERNEL);
10828
+ GFP_KERNEL_ACCOUNT);
926110829 if (!slot->arch.rmap[i])
926210830 goto out_free;
926310831 if (i == 0)
926410832 continue;
926510833
9266
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
10834
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
926710835 if (!linfo)
926810836 goto out_free;
926910837
....@@ -9276,11 +10844,9 @@
927610844 ugfn = slot->userspace_addr >> PAGE_SHIFT;
927710845 /*
927810846 * If the gfn and userspace address are not aligned wrt each
9279
- * other, or if explicitly asked to, disable large page
9280
- * support for this slot
10847
+ * other, disable large page support for this slot.
928110848 */
9282
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
9283
- !kvm_largepages_enabled()) {
10849
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
928410850 unsigned long j;
928510851
928610852 for (j = 0; j < lpages; ++j)
....@@ -9327,76 +10893,23 @@
932710893 const struct kvm_userspace_memory_region *mem,
932810894 enum kvm_mr_change change)
932910895 {
9330
- if (change == KVM_MR_MOVE)
9331
- return kvm_arch_create_memslot(kvm, memslot,
9332
- mem->memory_size >> PAGE_SHIFT);
9333
-
10896
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
10897
+ return kvm_alloc_memslot_metadata(memslot,
10898
+ mem->memory_size >> PAGE_SHIFT);
933410899 return 0;
933510900 }
933610901
933710902 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
9338
- struct kvm_memory_slot *new)
10903
+ struct kvm_memory_slot *old,
10904
+ struct kvm_memory_slot *new,
10905
+ enum kvm_mr_change change)
933910906 {
9340
- /* Still write protect RO slot */
9341
- if (new->flags & KVM_MEM_READONLY) {
9342
- kvm_mmu_slot_remove_write_access(kvm, new);
9343
- return;
9344
- }
9345
-
934610907 /*
9347
- * Call kvm_x86_ops dirty logging hooks when they are valid.
9348
- *
9349
- * kvm_x86_ops->slot_disable_log_dirty is called when:
9350
- *
9351
- * - KVM_MR_CREATE with dirty logging is disabled
9352
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
9353
- *
9354
- * The reason is, in case of PML, we need to set D-bit for any slots
9355
- * with dirty logging disabled in order to eliminate unnecessary GPA
9356
- * logging in PML buffer (and potential PML buffer full VMEXT). This
9357
- * guarantees leaving PML enabled during guest's lifetime won't have
9358
- * any additonal overhead from PML when guest is running with dirty
9359
- * logging disabled for memory slots.
9360
- *
9361
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
9362
- * to dirty logging mode.
9363
- *
9364
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
9365
- *
9366
- * In case of write protect:
9367
- *
9368
- * Write protect all pages for dirty logging.
9369
- *
9370
- * All the sptes including the large sptes which point to this
9371
- * slot are set to readonly. We can not create any new large
9372
- * spte on this slot until the end of the logging.
9373
- *
9374
- * See the comments in fast_page_fault().
10908
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
10909
+ * See comments below.
937510910 */
9376
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
9377
- if (kvm_x86_ops->slot_enable_log_dirty)
9378
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
9379
- else
9380
- kvm_mmu_slot_remove_write_access(kvm, new);
9381
- } else {
9382
- if (kvm_x86_ops->slot_disable_log_dirty)
9383
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
9384
- }
9385
-}
9386
-
9387
-void kvm_arch_commit_memory_region(struct kvm *kvm,
9388
- const struct kvm_userspace_memory_region *mem,
9389
- const struct kvm_memory_slot *old,
9390
- const struct kvm_memory_slot *new,
9391
- enum kvm_mr_change change)
9392
-{
9393
- int nr_mmu_pages = 0;
9394
-
9395
- if (!kvm->arch.n_requested_mmu_pages)
9396
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
9397
-
9398
- if (nr_mmu_pages)
9399
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
10911
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
10912
+ return;
940010913
940110914 /*
940210915 * Dirty logging tracks sptes in 4k granularity, meaning that large
....@@ -9409,29 +10922,91 @@
940910922 * Scan sptes if dirty logging has been stopped, dropping those
941010923 * which can be collapsed into a single large-page spte. Later
941110924 * page faults will create the large-page sptes.
10925
+ *
10926
+ * There is no need to do this in any of the following cases:
10927
+ * CREATE: No dirty mappings will already exist.
10928
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
10929
+ * kvm_arch_flush_shadow_memslot()
941210930 */
9413
- if ((change != KVM_MR_DELETE) &&
9414
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
9415
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
10931
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
10932
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
941610933 kvm_mmu_zap_collapsible_sptes(kvm, new);
941710934
941810935 /*
9419
- * Set up write protection and/or dirty logging for the new slot.
10936
+ * Enable or disable dirty logging for the slot.
942010937 *
9421
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
9422
- * been zapped so no dirty logging staff is needed for old slot. For
9423
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
9424
- * new and it's also covered when dealing with the new slot.
10938
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
10939
+ * slot have been zapped so no dirty logging updates are needed for
10940
+ * the old slot.
10941
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
10942
+ * any mappings that might be created in it will consume the
10943
+ * properties of the new slot and do not need to be updated here.
942510944 *
10945
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
10946
+ * called to enable/disable dirty logging.
10947
+ *
10948
+ * When disabling dirty logging with PML enabled, the D-bit is set
10949
+ * for sptes in the slot in order to prevent unnecessary GPA
10950
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
10951
+ * This guarantees leaving PML enabled for the guest's lifetime
10952
+ * won't have any additional overhead from PML when the guest is
10953
+ * running with dirty logging disabled.
10954
+ *
10955
+ * When enabling dirty logging, large sptes are write-protected
10956
+ * so they can be split on first write. New large sptes cannot
10957
+ * be created for this slot until the end of the logging.
10958
+ * See the comments in fast_page_fault().
10959
+ * For small sptes, nothing is done if the dirty log is in the
10960
+ * initial-all-set state. Otherwise, depending on whether pml
10961
+ * is enabled the D-bit or the W-bit will be cleared.
10962
+ */
10963
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
10964
+ if (kvm_x86_ops.slot_enable_log_dirty) {
10965
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
10966
+ } else {
10967
+ int level =
10968
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
10969
+ PG_LEVEL_2M : PG_LEVEL_4K;
10970
+
10971
+ /*
10972
+ * If we're with initial-all-set, we don't need
10973
+ * to write protect any small page because
10974
+ * they're reported as dirty already. However
10975
+ * we still need to write-protect huge pages
10976
+ * so that the page split can happen lazily on
10977
+ * the first write to the huge page.
10978
+ */
10979
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
10980
+ }
10981
+ } else {
10982
+ if (kvm_x86_ops.slot_disable_log_dirty)
10983
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
10984
+ }
10985
+}
10986
+
10987
+void kvm_arch_commit_memory_region(struct kvm *kvm,
10988
+ const struct kvm_userspace_memory_region *mem,
10989
+ struct kvm_memory_slot *old,
10990
+ const struct kvm_memory_slot *new,
10991
+ enum kvm_mr_change change)
10992
+{
10993
+ if (!kvm->arch.n_requested_mmu_pages)
10994
+ kvm_mmu_change_mmu_pages(kvm,
10995
+ kvm_mmu_calculate_default_mmu_pages(kvm));
10996
+
10997
+ /*
942610998 * FIXME: const-ify all uses of struct kvm_memory_slot.
942710999 */
9428
- if (change != KVM_MR_DELETE)
9429
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
11000
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
11001
+
11002
+ /* Free the arrays associated with the old memslot. */
11003
+ if (change == KVM_MR_MOVE)
11004
+ kvm_arch_free_memslot(kvm, old);
943011005 }
943111006
943211007 void kvm_arch_flush_shadow_all(struct kvm *kvm)
943311008 {
9434
- kvm_mmu_invalidate_zap_all_pages(kvm);
11009
+ kvm_mmu_zap_all(kvm);
943511010 }
943611011
943711012 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
....@@ -9443,8 +11018,8 @@
944311018 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
944411019 {
944511020 return (is_guest_mode(vcpu) &&
9446
- kvm_x86_ops->guest_apic_has_interrupt &&
9447
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
11021
+ kvm_x86_ops.guest_apic_has_interrupt &&
11022
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
944811023 }
944911024
945011025 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
....@@ -9463,11 +11038,12 @@
946311038
946411039 if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
946511040 (vcpu->arch.nmi_pending &&
9466
- kvm_x86_ops->nmi_allowed(vcpu)))
11041
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
946711042 return true;
946811043
946911044 if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
9470
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
11045
+ (vcpu->arch.smi_pending &&
11046
+ kvm_x86_ops.smi_allowed(vcpu, false)))
947111047 return true;
947211048
947311049 if (kvm_arch_interrupt_allowed(vcpu) &&
....@@ -9476,6 +11052,11 @@
947611052 return true;
947711053
947811054 if (kvm_hv_has_stimer_pending(vcpu))
11055
+ return true;
11056
+
11057
+ if (is_guest_mode(vcpu) &&
11058
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
11059
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
947911060 return true;
948011061
948111062 return false;
....@@ -9496,7 +11077,7 @@
949611077 kvm_test_request(KVM_REQ_EVENT, vcpu))
949711078 return true;
949811079
9499
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
11080
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
950011081 return true;
950111082
950211083 return false;
....@@ -9514,7 +11095,7 @@
951411095
951511096 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
951611097 {
9517
- return kvm_x86_ops->interrupt_allowed(vcpu);
11098
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
951811099 }
951911100
952011101 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
....@@ -9536,7 +11117,7 @@
953611117 {
953711118 unsigned long rflags;
953811119
9539
- rflags = kvm_x86_ops->get_rflags(vcpu);
11120
+ rflags = kvm_x86_ops.get_rflags(vcpu);
954011121 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
954111122 rflags &= ~X86_EFLAGS_TF;
954211123 return rflags;
....@@ -9548,7 +11129,7 @@
954811129 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
954911130 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
955011131 rflags |= X86_EFLAGS_TF;
9551
- kvm_x86_ops->set_rflags(vcpu, rflags);
11132
+ kvm_x86_ops.set_rflags(vcpu, rflags);
955211133 }
955311134
955411135 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
....@@ -9562,7 +11143,7 @@
956211143 {
956311144 int r;
956411145
9565
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
11146
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
956611147 work->wakeup_all)
956711148 return;
956811149
....@@ -9570,21 +11151,23 @@
957011151 if (unlikely(r))
957111152 return;
957211153
9573
- if (!vcpu->arch.mmu.direct_map &&
9574
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
11154
+ if (!vcpu->arch.mmu->direct_map &&
11155
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
957511156 return;
957611157
9577
- vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
11158
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
957811159 }
957911160
958011161 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
958111162 {
11163
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
11164
+
958211165 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
958311166 }
958411167
958511168 static inline u32 kvm_async_pf_next_probe(u32 key)
958611169 {
9587
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
11170
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
958811171 }
958911172
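/*
 * Editor's note -- illustrative sketch, not part of this patch. It models
 * the open-addressed gfn table used by the async-PF helpers here: free
 * slots hold ~0, and the probe wraps with "& (size - 1)", which is a valid
 * modulo only when the table size is a power of two -- the reason for the
 * new BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)). EXAMPLE_TABLE_SIZE
 * and the example_* helpers are made up for illustration (kernel u32/u64
 * types assumed).
 */
#define EXAMPLE_TABLE_SIZE 64		/* stand-in for ASYNC_PF_PER_VCPU */

static u64 example_gfns[EXAMPLE_TABLE_SIZE] = {
	[0 ... EXAMPLE_TABLE_SIZE - 1] = ~0ULL	/* all slots start free */
};

static u32 example_next_probe(u32 key)
{
	return (key + 1) & (EXAMPLE_TABLE_SIZE - 1);	/* power-of-two wrap */
}

static void example_add_gfn(u32 key, u64 gfn)
{
	/* Linear probing: walk forward until a free (~0) slot is found. */
	while (example_gfns[key] != ~0ULL)
		key = example_next_probe(key);
	example_gfns[key] = gfn;
}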
959011173 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
....@@ -9602,7 +11185,7 @@
960211185 int i;
960311186 u32 key = kvm_async_pf_hash_fn(gfn);
960411187
9605
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
11188
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
960611189 (vcpu->arch.apf.gfns[key] != gfn &&
960711190 vcpu->arch.apf.gfns[key] != ~0); i++)
960811191 key = kvm_async_pf_next_probe(key);
....@@ -9620,6 +11203,10 @@
962011203 u32 i, j, k;
962111204
962211205 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
11206
+
11207
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
11208
+ return;
11209
+
962311210 while (true) {
962411211 vcpu->arch.apf.gfns[i] = ~0;
962511212 do {
....@@ -9638,21 +11225,64 @@
963811225 }
963911226 }
964011227
9641
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
11228
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
964211229 {
11230
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
964311231
9644
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
9645
- sizeof(val));
11232
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
11233
+ sizeof(reason));
964611234 }
964711235
9648
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
11236
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
964911237 {
11238
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
965011239
9651
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
9652
- sizeof(u32));
11240
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11241
+ &token, offset, sizeof(token));
965311242 }
965411243
9655
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
11244
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
11245
+{
11246
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
11247
+ u32 val;
11248
+
11249
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
11250
+ &val, offset, sizeof(val)))
11251
+ return false;
11252
+
11253
+ return !val;
11254
+}
11255
+
11256
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
11257
+{
11258
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
11259
+ return false;
11260
+
11261
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
11262
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
11263
+ return false;
11264
+
11265
+ return true;
11266
+}
11267
+
11268
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
11269
+{
11270
+ if (unlikely(!lapic_in_kernel(vcpu) ||
11271
+ kvm_event_needs_reinjection(vcpu) ||
11272
+ vcpu->arch.exception.pending))
11273
+ return false;
11274
+
11275
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
11276
+ return false;
11277
+
11278
+ /*
11279
+ * If interrupts are off we cannot even use an artificial
11280
+ * halt state.
11281
+ */
11282
+ return kvm_arch_interrupt_allowed(vcpu);
11283
+}
11284
+
11285
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
965611286 struct kvm_async_pf *work)
965711287 {
965811288 struct x86_exception fault;
....@@ -9660,11 +11290,8 @@
966011290 trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
966111291 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
966211292
9663
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
9664
- (vcpu->arch.apf.send_user_only &&
9665
- kvm_x86_ops->get_cpl(vcpu) == 0))
9666
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
9667
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
11293
+ if (kvm_can_deliver_async_pf(vcpu) &&
11294
+ !apf_put_user_notpresent(vcpu)) {
966811295 fault.vector = PF_VECTOR;
966911296 fault.error_code_valid = true;
967011297 fault.error_code = 0;
....@@ -9672,14 +11299,28 @@
967211299 fault.address = work->arch.token;
967311300 fault.async_page_fault = true;
967411301 kvm_inject_page_fault(vcpu, &fault);
11302
+ return true;
11303
+ } else {
11304
+ /*
11305
+ * It is not possible to deliver a paravirtualized asynchronous
11306
+ * page fault, but putting the guest in an artificial halt state
11307
+ * can be beneficial nevertheless: if an interrupt arrives, we
11308
+ * can deliver it timely and perhaps the guest will schedule
11309
+ * another process. When the instruction that triggered a page
11310
+ * fault is retried, hopefully the page will be ready in the host.
11311
+ */
11312
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
11313
+ return false;
967511314 }
967611315 }
967711316
967811317 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
967911318 struct kvm_async_pf *work)
968011319 {
9681
- struct x86_exception fault;
9682
- u32 val;
11320
+ struct kvm_lapic_irq irq = {
11321
+ .delivery_mode = APIC_DM_FIXED,
11322
+ .vector = vcpu->arch.apf.vec
11323
+ };
968311324
968411325 if (work->wakeup_all)
968511326 work->arch.token = ~0; /* broadcast wakeup */
....@@ -9687,37 +11328,30 @@
968711328 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
968811329 trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
968911330
9690
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
9691
- !apf_get_user(vcpu, &val)) {
9692
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
9693
- vcpu->arch.exception.pending &&
9694
- vcpu->arch.exception.nr == PF_VECTOR &&
9695
- !apf_put_user(vcpu, 0)) {
9696
- vcpu->arch.exception.injected = false;
9697
- vcpu->arch.exception.pending = false;
9698
- vcpu->arch.exception.nr = 0;
9699
- vcpu->arch.exception.has_error_code = false;
9700
- vcpu->arch.exception.error_code = 0;
9701
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
9702
- fault.vector = PF_VECTOR;
9703
- fault.error_code_valid = true;
9704
- fault.error_code = 0;
9705
- fault.nested_page_fault = false;
9706
- fault.address = work->arch.token;
9707
- fault.async_page_fault = true;
9708
- kvm_inject_page_fault(vcpu, &fault);
9709
- }
11331
+ if ((work->wakeup_all || work->notpresent_injected) &&
11332
+ kvm_pv_async_pf_enabled(vcpu) &&
11333
+ !apf_put_user_ready(vcpu, work->arch.token)) {
11334
+ vcpu->arch.apf.pageready_pending = true;
11335
+ kvm_apic_set_irq(vcpu, &irq, NULL);
971011336 }
11337
+
971111338 vcpu->arch.apf.halted = false;
971211339 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
971311340 }
971411341
9715
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
11342
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
971611343 {
9717
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
11344
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
11345
+ if (!vcpu->arch.apf.pageready_pending)
11346
+ kvm_vcpu_kick(vcpu);
11347
+}
11348
+
11349
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
11350
+{
11351
+ if (!kvm_pv_async_pf_enabled(vcpu))
971811352 return true;
971911353 else
9720
- return kvm_can_do_async_pf(vcpu);
11354
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
972111355 }
972211356
972311357 void kvm_arch_start_assignment(struct kvm *kvm)
....@@ -9732,9 +11366,9 @@
973211366 }
973311367 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
973411368
9735
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
11369
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
973611370 {
9737
- return atomic_read(&kvm->arch.assigned_device_count);
11371
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
973811372 }
973911373 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
974011374
....@@ -9758,7 +11392,7 @@
975811392
975911393 bool kvm_arch_has_irq_bypass(void)
976011394 {
9761
- return kvm_x86_ops->update_pi_irte != NULL;
11395
+ return true;
976211396 }
976311397
976411398 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
....@@ -9766,11 +11400,17 @@
976611400 {
976711401 struct kvm_kernel_irqfd *irqfd =
976811402 container_of(cons, struct kvm_kernel_irqfd, consumer);
11403
+ int ret;
976911404
977011405 irqfd->producer = prod;
11406
+ kvm_arch_start_assignment(irqfd->kvm);
11407
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
11408
+ prod->irq, irqfd->gsi, 1);
977111409
9772
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
9773
- prod->irq, irqfd->gsi, 1);
11410
+ if (ret)
11411
+ kvm_arch_end_assignment(irqfd->kvm);
11412
+
11413
+ return ret;
977411414 }
977511415
977611416 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
....@@ -9789,26 +11429,185 @@
978911429 * when the irq is masked/disabled or the consumer side (KVM
979011430 	 * in this case) doesn't want to receive the interrupts.
979111431 */
9792
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
11432
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
979311433 if (ret)
979411434 printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
979511435 " fails: %d\n", irqfd->consumer.token, ret);
11436
+
11437
+ kvm_arch_end_assignment(irqfd->kvm);
979611438 }
979711439
979811440 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
979911441 uint32_t guest_irq, bool set)
980011442 {
9801
- if (!kvm_x86_ops->update_pi_irte)
9802
- return -EINVAL;
9803
-
9804
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
11443
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
980511444 }
980611445
980711446 bool kvm_vector_hashing_enabled(void)
980811447 {
980911448 return vector_hashing;
981011449 }
9811
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
11450
+
11451
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
11452
+{
11453
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
11454
+}
11455
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
11456
+
11457
+
11458
+int kvm_spec_ctrl_test_value(u64 value)
11459
+{
11460
+ /*
11461
+ * test that setting IA32_SPEC_CTRL to given value
11462
+ * is allowed by the host processor
11463
+ */
11464
+
11465
+ u64 saved_value;
11466
+ unsigned long flags;
11467
+ int ret = 0;
11468
+
11469
+ local_irq_save(flags);
11470
+
11471
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
11472
+ ret = 1;
11473
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
11474
+ ret = 1;
11475
+ else
11476
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
11477
+
11478
+ local_irq_restore(flags);
11479
+
11480
+ return ret;
11481
+}
11482
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
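/*
 * Editor's note -- illustrative sketch, not part of this patch. It shows
 * the intended use of kvm_spec_ctrl_test_value(): reject a guest WRMSR to
 * IA32_SPEC_CTRL whose bits the host CPU itself refuses to accept.
 * example_set_spec_ctrl() and the caller-provided shadow pointer are
 * hypothetical; the real callers live in the vendor modules.
 */
static int example_set_spec_ctrl(struct kvm_vcpu *vcpu, u64 data, u64 *shadow)
{
	if (kvm_spec_ctrl_test_value(data))
		return 1;	/* caller injects #GP into the guest */

	*shadow = data;		/* vendor code keeps its own shadow copy */
	return 0;
}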
11483
+
11484
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
11485
+{
11486
+ struct x86_exception fault;
11487
+ u32 access = error_code &
11488
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
11489
+
11490
+ if (!(error_code & PFERR_PRESENT_MASK) ||
11491
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
11492
+ /*
11493
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
11494
+ * tables probably do not match the TLB. Just proceed
11495
+ * with the error code that the processor gave.
11496
+ */
11497
+ fault.vector = PF_VECTOR;
11498
+ fault.error_code_valid = true;
11499
+ fault.error_code = error_code;
11500
+ fault.nested_page_fault = false;
11501
+ fault.address = gva;
11502
+ }
11503
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
11504
+}
11505
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
11506
+
11507
+/*
11508
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
11509
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
11510
+ * indicates whether exit to userspace is needed.
11511
+ */
11512
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
11513
+ struct x86_exception *e)
11514
+{
11515
+ if (r == X86EMUL_PROPAGATE_FAULT) {
11516
+ kvm_inject_emulated_page_fault(vcpu, e);
11517
+ return 1;
11518
+ }
11519
+
11520
+ /*
11521
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
11522
+ * while handling a VMX instruction KVM could've handled the request
11523
+ * correctly by exiting to userspace and performing I/O but there
11524
+ * doesn't seem to be a real use-case behind such requests, just return
11525
+ * KVM_EXIT_INTERNAL_ERROR for now.
11526
+ */
11527
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11528
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11529
+ vcpu->run->internal.ndata = 0;
11530
+
11531
+ return 0;
11532
+}
11533
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
11534
+
11535
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
11536
+{
11537
+ bool pcid_enabled;
11538
+ struct x86_exception e;
11539
+ unsigned i;
11540
+ unsigned long roots_to_free = 0;
11541
+ struct {
11542
+ u64 pcid;
11543
+ u64 gla;
11544
+ } operand;
11545
+ int r;
11546
+
11547
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
11548
+ if (r != X86EMUL_CONTINUE)
11549
+ return kvm_handle_memory_failure(vcpu, r, &e);
11550
+
11551
+ if (operand.pcid >> 12 != 0) {
11552
+ kvm_inject_gp(vcpu, 0);
11553
+ return 1;
11554
+ }
11555
+
11556
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
11557
+
11558
+ switch (type) {
11559
+ case INVPCID_TYPE_INDIV_ADDR:
11560
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
11561
+ is_noncanonical_address(operand.gla, vcpu)) {
11562
+ kvm_inject_gp(vcpu, 0);
11563
+ return 1;
11564
+ }
11565
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
11566
+ return kvm_skip_emulated_instruction(vcpu);
11567
+
11568
+ case INVPCID_TYPE_SINGLE_CTXT:
11569
+ if (!pcid_enabled && (operand.pcid != 0)) {
11570
+ kvm_inject_gp(vcpu, 0);
11571
+ return 1;
11572
+ }
11573
+
11574
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
11575
+ kvm_mmu_sync_roots(vcpu);
11576
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
11577
+ }
11578
+
11579
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
11580
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
11581
+ == operand.pcid)
11582
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
11583
+
11584
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
11585
+ /*
11586
+ * If neither the current cr3 nor any of the prev_roots use the
11587
+ * given PCID, then nothing needs to be done here because a
11588
+ * resync will happen anyway before switching to any other CR3.
11589
+ */
11590
+
11591
+ return kvm_skip_emulated_instruction(vcpu);
11592
+
11593
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
11594
+ /*
11595
+ * Currently, KVM doesn't mark global entries in the shadow
11596
+ * page tables, so a non-global flush just degenerates to a
11597
+ * global flush. If needed, we could optimize this later by
11598
+ * keeping track of global entries in shadow page tables.
11599
+ */
11600
+
11601
+ fallthrough;
11602
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
11603
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
11604
+ return kvm_skip_emulated_instruction(vcpu);
11605
+
11606
+ default:
11607
+ BUG(); /* We have already checked above that type <= 3 */
11608
+ }
11609
+}
11610
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
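/*
 * Editor's note -- illustrative sketch, not part of this patch. Vendor
 * exit handlers are expected to decode the INVPCID type and the memory
 * operand from their own exit information and then defer to the common
 * kvm_handle_invpcid() above. The decode is omitted here; note that an
 * out-of-range type must be rejected before calling the helper, which
 * BUG()s on type > 3.
 */
static int example_handle_invpcid_exit(struct kvm_vcpu *vcpu,
					unsigned long type, gva_t gva)
{
	if (type > 3) {
		kvm_inject_gp(vcpu, 0);	/* reserved type => #GP, stay in guest */
		return 1;
	}

	return kvm_handle_invpcid(vcpu, type, gva);
}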
981211611
981311612 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
981411613 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
....@@ -9820,12 +11619,31 @@
982011619 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
982111620 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
982211621 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
11622
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
982311623 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
982411624 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
982511625 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
982611626 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
9827
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
11627
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
982811628 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
982911629 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
983011630 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
983111631 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
11632
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
11633
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
11634
+
11635
+static int __init kvm_x86_init(void)
11636
+{
11637
+ kvm_mmu_x86_module_init();
11638
+ return 0;
11639
+}
11640
+module_init(kvm_x86_init);
11641
+
11642
+static void __exit kvm_x86_exit(void)
11643
+{
11644
+ /*
11645
+ * If module_init() is implemented, module_exit() must also be
11646
+ * implemented to allow module unload.
11647
+ */
11648
+}
11649
+module_exit(kvm_x86_exit);