From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/arch/x86/kvm/x86.c | 4912 +++++++++++++++++++++++++++++++++++++++++------------------
1 files changed, 3,365 insertions(+), 1,547 deletions(-)
diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c
index 417abc9..9d30158 100644
--- a/kernel/arch/x86/kvm/x86.c
+++ b/kernel/arch/x86/kvm/x86.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
@@ -13,22 +14,21 @@
* Yaniv Kamay <yaniv@qumranet.com>
* Amit Shah <amit.shah@qumranet.com>
* Ben-Ami Yassour <benami@il.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
*/
#include <linux/kvm_host.h>
#include "irq.h"
+#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
+#include "lapic.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -54,7 +54,9 @@
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
+#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
#include <trace/events/kvm.h>
@@ -69,6 +71,10 @@
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
+#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
+#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
#include "trace.h"
@@ -79,7 +85,7 @@
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
- container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+ ((struct kvm_vcpu *)(ctxt)->vcpu)
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -108,7 +111,7 @@
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool __read_mostly ignore_msrs = 0;
@@ -138,10 +141,14 @@
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
-/* lapic timer advance (tscdeadline mode only) in nanoseconds */
-unsigned int __read_mostly lapic_timer_advance_ns = 0;
-module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
-EXPORT_SYMBOL_GPL(lapic_timer_advance_ns);
+/*
+ * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
+ * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * advancement entirely. Any other value is used as-is and disables adaptive
+ * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ */
+static int __read_mostly lapic_timer_advance_ns = -1;
+module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, S_IRUGO);
@@ -153,85 +160,149 @@
static bool __read_mostly force_emulation_prefix = false;
module_param(force_emulation_prefix, bool, S_IRUGO);
-#define KVM_NR_SHARED_MSRS 16
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
-struct kvm_shared_msrs_global {
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
+
+struct kvm_user_return_msrs_global {
int nr;
- u32 msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
- struct kvm_shared_msr_values {
+ struct kvm_user_return_msr_values {
u64 host;
u64 curr;
- } values[KVM_NR_SHARED_MSRS];
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "pf_fixed", VCPU_STAT(pf_fixed) },
- { "pf_guest", VCPU_STAT(pf_guest) },
- { "tlb_flush", VCPU_STAT(tlb_flush) },
- { "invlpg", VCPU_STAT(invlpg) },
- { "exits", VCPU_STAT(exits) },
- { "io_exits", VCPU_STAT(io_exits) },
- { "mmio_exits", VCPU_STAT(mmio_exits) },
- { "signal_exits", VCPU_STAT(signal_exits) },
- { "irq_window", VCPU_STAT(irq_window_exits) },
- { "nmi_window", VCPU_STAT(nmi_window_exits) },
- { "halt_exits", VCPU_STAT(halt_exits) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
- { "irq_exits", VCPU_STAT(irq_exits) },
- { "host_state_reload", VCPU_STAT(host_state_reload) },
- { "fpu_reload", VCPU_STAT(fpu_reload) },
- { "insn_emulation", VCPU_STAT(insn_emulation) },
- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
- { "irq_injections", VCPU_STAT(irq_injections) },
- { "nmi_injections", VCPU_STAT(nmi_injections) },
- { "req_event", VCPU_STAT(req_event) },
- { "l1d_flush", VCPU_STAT(l1d_flush) },
- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
- { "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
- { "mmu_flooded", VM_STAT(mmu_flooded) },
- { "mmu_recycled", VM_STAT(mmu_recycled) },
- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
- { "mmu_unsync", VM_STAT(mmu_unsync) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages, .mode = 0444) },
- { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
- { "max_mmu_page_hash_collisions",
- VM_STAT(max_mmu_page_hash_collisions) },
+ VCPU_STAT("pf_fixed", pf_fixed),
+ VCPU_STAT("pf_guest", pf_guest),
+ VCPU_STAT("tlb_flush", tlb_flush),
+ VCPU_STAT("invlpg", invlpg),
+ VCPU_STAT("exits", exits),
+ VCPU_STAT("io_exits", io_exits),
+ VCPU_STAT("mmio_exits", mmio_exits),
+ VCPU_STAT("signal_exits", signal_exits),
+ VCPU_STAT("irq_window", irq_window_exits),
+ VCPU_STAT("nmi_window", nmi_window_exits),
+ VCPU_STAT("halt_exits", halt_exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("hypercalls", hypercalls),
+ VCPU_STAT("request_irq", request_irq_exits),
+ VCPU_STAT("irq_exits", irq_exits),
+ VCPU_STAT("host_state_reload", host_state_reload),
+ VCPU_STAT("fpu_reload", fpu_reload),
+ VCPU_STAT("insn_emulation", insn_emulation),
+ VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+ VCPU_STAT("irq_injections", irq_injections),
+ VCPU_STAT("nmi_injections", nmi_injections),
+ VCPU_STAT("req_event", req_event),
+ VCPU_STAT("l1d_flush", l1d_flush),
+ VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("preemption_reported", preemption_reported),
+ VCPU_STAT("preemption_other", preemption_other),
+ VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+ VM_STAT("mmu_pte_write", mmu_pte_write),
+ VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+ VM_STAT("mmu_flooded", mmu_flooded),
+ VM_STAT("mmu_recycled", mmu_recycled),
+ VM_STAT("mmu_cache_miss", mmu_cache_miss),
+ VM_STAT("mmu_unsync", mmu_unsync),
+ VM_STAT("remote_tlb_flush", remote_tlb_flush),
+ VM_STAT("largepages", lpages, .mode = 0444),
+ VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+ VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
{ NULL }
};
u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
+
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
+{
+ const char *op = write ? "wrmsr" : "rdmsr";
+
+ if (ignore_msrs) {
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ /* Mask the error */
+ return true;
+ } else {
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return false;
+ }
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+ unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+ unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+ return kmem_cache_create_usercopy("x86_emulator", size,
+ __alignof__(struct x86_emulate_ctxt),
+ SLAB_ACCOUNT, useroffset,
+ size - useroffset, NULL);
+}
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msrs *locals
- = container_of(urn, struct kvm_shared_msrs, urn);
- struct kvm_shared_msr_values *values;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
unsigned long flags;
/*
@@ -239,84 +310,89 @@
* interrupted and executed through kvm_arch_hardware_disable()
*/
local_irq_save(flags);
- if (locals->registered) {
- locals->registered = false;
+ if (msrs->registered) {
+ msrs->registered = false;
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- values = &locals->values[slot];
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
values->curr = values->host;
}
}
}
-static void shared_msr_update(unsigned slot, u32 msr)
+int kvm_probe_user_return_msr(u32 msr)
{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
+
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
+{
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+ user_return_msrs_global.msrs[slot] = msr;
+ if (slot >= user_return_msrs_global.nr)
+ user_return_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
u64 value;
- unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int i;
- /* only read, and nobody should modify it at this time,
- * so don't need lock */
- if (slot >= shared_msrs_global.nr) {
- printk(KERN_ERR "kvm: invalid MSR slot!");
- return;
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
}
- rdmsrl_safe(msr, &value);
- smsr->values[slot].host = value;
- smsr->values[slot].curr = value;
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
-{
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
- shared_msrs_global.msrs[slot] = msr;
- if (slot >= shared_msrs_global.nr)
- shared_msrs_global.nr = slot + 1;
-}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
-
-static void kvm_shared_msr_cpu_online(void)
-{
- unsigned i;
-
- for (i = 0; i < shared_msrs_global.nr; ++i)
- shared_msr_update(i, shared_msrs_global.msrs[i]);
-}
-
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
int err;
- value = (value & mask) | (smsr->values[slot].host & ~mask);
- if (value == smsr->values[slot].curr)
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
if (err)
return 1;
- smsr->values[slot].curr = value;
- if (!smsr->registered) {
- smsr->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&smsr->urn);
- smsr->registered = true;
+ msrs->values[slot].curr = value;
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
- if (smsr->registered)
- kvm_on_user_return(&smsr->urn);
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
}
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
@@ -348,14 +424,15 @@
}
kvm_lapic_set_base(vcpu, msr_info->data);
+ kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible void kvm_spurious_fault(void)
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
- BUG();
+ BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
@@ -384,6 +461,7 @@
#define EXCPT_TRAP 1
#define EXCPT_ABORT 2
#define EXCPT_INTERRUPT 3
+#define EXCPT_DB 4
static int exception_type(int vector)
{
@@ -394,8 +472,14 @@
mask = 1 << vector;
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
return EXCPT_TRAP;
if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
@@ -405,9 +489,59 @@
return EXCPT_FAULT;
}
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+{
+ unsigned nr = vcpu->arch.exception.nr;
+ bool has_payload = vcpu->arch.exception.has_payload;
+ unsigned long payload = vcpu->arch.exception.payload;
+
+ if (!has_payload)
+ return;
+
+ switch (nr) {
+ case DB_VECTOR:
+ /*
+ * "Certain debug exceptions may clear bit 0-3. The
+ * remaining contents of the DR6 register are never
+ * cleared by the processor".
+ */
+ vcpu->arch.dr6 &= ~DR_TRAP_BITS;
+ /*
+ * DR6.RTM is set by all #DB exceptions that don't clear it.
+ */
+ vcpu->arch.dr6 |= DR6_RTM;
+ vcpu->arch.dr6 |= payload;
+ /*
+ * Bit 16 should be set in the payload whenever the #DB
+ * exception should clear DR6.RTM. This makes the payload
+ * compatible with the pending debug exceptions under VMX.
+ * Though not currently documented in the SDM, this also
+ * makes the payload compatible with the exit qualification
+ * for #DB exceptions under VMX.
+ */
+ vcpu->arch.dr6 ^= payload & DR6_RTM;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
+ break;
+ case PF_VECTOR:
+ vcpu->arch.cr2 = payload;
+ break;
+ }
+
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
- bool reinject)
+ bool has_payload, unsigned long payload, bool reinject)
{
u32 prev_nr;
int class1, class2;
@@ -427,6 +561,14 @@
*/
WARN_ON_ONCE(vcpu->arch.exception.pending);
vcpu->arch.exception.injected = true;
+ if (WARN_ON_ONCE(has_payload)) {
+ /*
+ * A reinjected event has already
+ * delivered its payload.
+ */
+ has_payload = false;
+ payload = 0;
+ }
} else {
vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
@@ -434,6 +576,10 @@
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
vcpu->arch.exception.error_code = error_code;
+ vcpu->arch.exception.has_payload = has_payload;
+ vcpu->arch.exception.payload = payload;
+ if (!is_guest_mode(vcpu))
+ kvm_deliver_exception_payload(vcpu);
return;
}
@@ -458,6 +604,8 @@
vcpu->arch.exception.has_error_code = true;
vcpu->arch.exception.nr = DF_VECTOR;
vcpu->arch.exception.error_code = 0;
+ vcpu->arch.exception.has_payload = false;
+ vcpu->arch.exception.payload = 0;
} else
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
@@ -467,15 +615,29 @@
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, false);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
- kvm_multiple_exception(vcpu, nr, false, 0, true);
+ kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+ unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
+}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
+
+static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
+ u32 error_code, unsigned long payload)
+{
+ kvm_multiple_exception(vcpu, nr, true, error_code,
+ true, payload, false);
+}
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
@@ -493,23 +655,38 @@
++vcpu->stat.pf_guest;
vcpu->arch.exception.nested_apf =
is_guest_mode(vcpu) && fault->async_page_fault;
- if (vcpu->arch.exception.nested_apf)
+ if (vcpu->arch.exception.nested_apf) {
vcpu->arch.apf.nested_apf_token = fault->address;
- else
- vcpu->arch.cr2 = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
+ } else {
+ kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
+ fault->address);
+ }
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu.inject_page_fault(vcpu, fault);
+ struct kvm_mmu *fault_mmu;
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+ vcpu->arch.walk_mmu;
+
+ /*
+ * Invalidate the TLB entry for the faulting address, if it exists,
+ * else the access will fault indefinitely (and to emulate hardware).
+ */
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
+ !(fault->error_code & PFERR_RSVD_MASK))
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
+ fault_mmu->root_hpa);
+
+ fault_mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
}
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
@@ -520,13 +697,13 @@
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, false);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
- kvm_multiple_exception(vcpu, nr, true, error_code, true);
+ kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
@@ -536,7 +713,7 @@
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -618,10 +795,8 @@
ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+
out:
return ret;
@@ -631,7 +806,6 @@
bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- bool changed = true;
int offset;
gfn_t gfn;
int r;
@@ -639,8 +813,7 @@
if (!is_pae_paging(vcpu))
return false;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
return true;
gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@ -648,17 +821,16 @@
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
+ return true;
- return changed;
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -676,27 +848,27 @@
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return 1;
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME)) {
- int cs_db, cs_l;
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
- if (!is_pae(vcpu))
- return 1;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
return 1;
}
+#endif
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ return 1;
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
- kvm_x86_ops->set_cr0(vcpu, cr0);
+ kvm_x86_ops.set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
@@ -721,27 +893,48 @@
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
- }
-}
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
+ }
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.pkru);
+}
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
+
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
- if (vcpu->guest_xcr0_loaded) {
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.host_pkru);
+ }
+
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, host_xss);
}
+
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
@@ -779,13 +972,13 @@
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -794,63 +987,20 @@
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
-{
- u64 reserved_bits = CR4_RESERVED_BITS;
-
- if (!cpu_has(c, X86_FEATURE_XSAVE))
- reserved_bits |= X86_CR4_OSXSAVE;
-
- if (!cpu_has(c, X86_FEATURE_SMEP))
- reserved_bits |= X86_CR4_SMEP;
-
- if (!cpu_has(c, X86_FEATURE_SMAP))
- reserved_bits |= X86_CR4_SMAP;
-
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
- reserved_bits |= X86_CR4_FSGSBASE;
-
- if (!cpu_has(c, X86_FEATURE_PKU))
- reserved_bits |= X86_CR4_PKE;
-
- if (!cpu_has(c, X86_FEATURE_LA57) &&
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
- reserved_bits |= X86_CR4_LA57;
-
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
- reserved_bits |= X86_CR4_UMIP;
-
- return reserved_bits;
-}
-
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -882,15 +1032,14 @@
return 1;
}
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
- return 1;
+ kvm_x86_ops.set_cr4(vcpu, cr4);
if (((cr4 ^ old_cr4) & mmu_role_bits) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
@@ -911,21 +1060,21 @@
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
if (!skip_tlb_flush) {
kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
return 0;
}
if (is_long_mode(vcpu) &&
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
return 0;
}
@@ -963,13 +1112,7 @@
}
}
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
-}
-
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
unsigned long dr7;
@@ -977,11 +1120,12 @@
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- kvm_x86_ops->set_dr7(vcpu, dr7);
+ kvm_x86_ops.set_dr7(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
@@ -1003,17 +1147,14 @@
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- /* fall through */
case 6:
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr6_valid(val))
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
- kvm_update_dr6(vcpu);
break;
case 5:
- /* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr7_valid(val))
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -1042,15 +1183,10 @@
*val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
- /* fall through */
case 6:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *val = vcpu->arch.dr6;
- else
- *val = kvm_x86_ops->get_dr6(vcpu);
+ *val = vcpu->arch.dr6;
break;
case 5:
- /* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
break;
@@ -1061,15 +1197,15 @@
bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
- u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
+ u32 ecx = kvm_rcx_read(vcpu);
u64 data;
int err;
err = kvm_pmu_rdpmc(vcpu, ecx, &data);
if (err)
return err;
- kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
- kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
+ kvm_rax_write(vcpu, (u32)data);
+ kvm_rdx_write(vcpu, data >> 32);
return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);
@@ -1078,26 +1214,66 @@
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
*
- * This list is modified at module load time to reflect the
+ * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
+ * extract the supported MSRs from the related const lists.
+ * msrs_to_save is selected from the msrs_to_save_all to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
* may depend on host virtualization features rather than host cpu features.
*/
-static u32 msrs_to_save[] = {
+static const u32 msrs_to_save_all[] = {
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
+ MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
+ MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
+ MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
+ MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
+ MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
+ MSR_IA32_UMWAIT_CONTROL,
+
+ MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
+ MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
+ MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
+ MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
+ MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
+ MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
+ MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
+ MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
+ MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
+ MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
+ MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
+ MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
+ MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
+ MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
};
+static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
static unsigned num_msrs_to_save;
-static u32 emulated_msrs[] = {
+static const u32 emulated_msrs_all[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
@@ -1113,12 +1289,18 @@
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
HV_X64_MSR_TSC_EMULATION_STATUS,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
+ MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -1128,15 +1310,41 @@
MSR_PLATFORM_INFO,
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
+ MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
+
+ /*
+ * The following list leaves out MSRs whose values are determined
+ * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
+ * We always support the "true" VMX control MSRs, even if the host
+ * processor does not, so I am putting these registers here rather
+ * than in msrs_to_save_all.
+ */
+ MSR_IA32_VMX_BASIC,
+ MSR_IA32_VMX_TRUE_PINBASED_CTLS,
+ MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
+ MSR_IA32_VMX_TRUE_EXIT_CTLS,
+ MSR_IA32_VMX_TRUE_ENTRY_CTLS,
+ MSR_IA32_VMX_MISC,
+ MSR_IA32_VMX_CR0_FIXED0,
+ MSR_IA32_VMX_CR4_FIXED0,
+ MSR_IA32_VMX_VMCS_ENUM,
+ MSR_IA32_VMX_PROCBASED_CTLS2,
+ MSR_IA32_VMX_EPT_VPID_CAP,
+ MSR_IA32_VMX_VMFUNC,
+
+ MSR_K7_HWCR,
+ MSR_KVM_POLL_CONTROL,
};
+static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;
/*
* List of msr numbers which are used to expose MSR-based features that
* can be used by a hypervisor to validate requested CPU features.
*/
-static u32 msr_based_features[] = {
+static const u32 msr_based_features_all[] = {
MSR_IA32_VMX_BASIC,
MSR_IA32_VMX_TRUE_PINBASED_CTLS,
MSR_IA32_VMX_PINBASED_CTLS,
@@ -1156,18 +1364,41 @@
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
- MSR_F10H_DECFG,
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
};
+static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;
-u64 kvm_get_arch_capabilities(void)
-{
- u64 data;
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
- rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data);
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
+
+static u64 kvm_get_arch_capabilities(void)
+{
+ u64 data = 0;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
+ data &= KVM_SUPPORTED_ARCH_CAP;
+ }
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -1196,34 +1427,30 @@
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
- /*
- * On TAA affected systems, export MDS_NO=0 when:
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
- * - Updated microcode is present. This is detected by
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
- * that VERW clears CPU buffers.
- *
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
- * mitigation and don't complain:
- *
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
- *
- * If TSX is disabled on the system, guests are also mitigated against
- * TAA and clear CPU buffer mitigation is not required for guests.
- */
- if (!boot_cpu_has(X86_FEATURE_RTM))
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
data &= ~ARCH_CAP_TAA_NO;
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
data |= ARCH_CAP_TAA_NO;
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
- data &= ~ARCH_CAP_MDS_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
+ }
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
+
return data;
}
-
-EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities);
static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
{
@@ -1235,8 +1462,7 @@
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- if (kvm_x86_ops->get_msr_feature(msr))
- return 1;
+ return kvm_x86_ops.get_msr_feature(msr);
}
return 0;
}
@@ -1248,6 +1474,14 @@
msr.index = index;
r = kvm_get_msr_feature(&msr);
+
+ if (r == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear the output for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ r = 0;
+ }
+
if (r)
return r;
@@ -1262,6 +1496,13 @@
return false;
if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ return false;
+
+ if (efer & (EFER_LME | EFER_LMA) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return false;
+
+ if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
return false;
return true;
@@ -1280,6 +1521,7 @@
{
u64 old_efer = vcpu->arch.efer;
u64 efer = msr_info->data;
+ int r;
if (efer & efer_reserved_bits)
return 1;
@@ -1296,7 +1538,11 @@
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- kvm_x86_ops->set_efer(vcpu, efer);
+ r = kvm_x86_ops.set_efer(vcpu, efer);
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@@ -1311,20 +1557,73 @@
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
+ struct kvm *kvm = vcpu->kvm;
+ bool allowed;
+ int idx;
+ u32 i;
+
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
+ return true;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ allowed = !!test_bit(index - start, bitmap);
+ break;
+ }
+
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return allowed;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+
/*
- * Writes msr value into into the appropriate "register".
+ * Write @data into the MSR specified by @index. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
+ bool host_initiated)
{
- switch (msr->index) {
+ struct msr_data msr;
+
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+
+ switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
case MSR_KERNEL_GS_BASE:
case MSR_CSTAR:
case MSR_LSTAR:
- if (is_noncanonical_address(msr->data, vcpu))
+ if (is_noncanonical_address(data, vcpu))
return 1;
break;
case MSR_IA32_SYSENTER_EIP:
@@ -1341,54 +1640,313 @@
* value, and that something deterministic happens if the guest
* invokes 64-bit SYSENTER.
*/
- msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu));
+ data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
}
- return kvm_x86_ops->set_msr(vcpu, msr);
+
+ msr.data = data;
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ return kvm_x86_ops.set_msr(vcpu, &msr);
+}
+
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID)
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Read the MSR specified by @index into @data. Select MSR specific fault
+ * checks are bypassed if @host_initiated is %true.
+ * Returns 0 on success, non-0 otherwise.
+ * Assumes vcpu_load() was already called.
+ */
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
+{
+ struct msr_data msr;
+ int ret;
+
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+
+ msr.index = index;
+ msr.host_initiated = host_initiated;
+
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
+ if (!ret)
+ *data = msr.data;
+ return ret;
+}
+
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+{
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear *data for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ ret = 0;
+ }
+
+ return ret;
+}
+
+int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
+{
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
+}
+EXPORT_SYMBOL_GPL(kvm_get_msr);
+
+int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
+{
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
+
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+{
+ if (vcpu->run->msr.error) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (is_read) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, true);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, false);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case KVM_MSR_RET_INVALID:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case KVM_MSR_RET_FILTERED:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_wrmsr, r);
+}
+
+int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data;
+ int r;
+
+ r = kvm_get_msr(vcpu, ecx, &data);
+
+ /* MSR read failed? See if we should ask user space */
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
+
+ /* MSR read failed? Inject a #GP */
+ if (r) {
+ trace_kvm_msr_read_ex(ecx);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_read(ecx, data);
+
+ kvm_rax_write(vcpu, data & -1u);
+ kvm_rdx_write(vcpu, (data >> 32) & -1u);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
+
+int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
+{
+ u32 ecx = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+ int r;
+
+ r = kvm_set_msr(vcpu, ecx, data);
+
+ /* MSR write failed? See if we should ask user space */
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
+ /* Bounce to user space */
+ return 0;
+
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
+
+ /* MSR write failed? Inject a #GP */
+ if (r > 0) {
+ trace_kvm_msr_write_ex(ecx, data);
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ trace_kvm_msr_write(ecx, data);
+ return kvm_skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
+ xfer_to_guest_mode_work_pending();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
+
+/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
+ * other cases which must be called after interrupts are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
+ return 1;
+
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
+
+ data &= ~(1 << 12);
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!kvm_can_use_hv_timer(vcpu))
+ return 1;
+
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ return 0;
+}
+
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data;
+ fastpath_t ret = EXIT_FASTPATH_NONE;
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
+ }
+ break;
+ case MSR_IA32_TSCDEADLINE:
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_REENTER_GUEST;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (ret != EXIT_FASTPATH_NONE)
+ trace_kvm_msr_write(msr, data);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- struct msr_data msr;
- int r;
-
- msr.index = index;
- msr.host_initiated = true;
- r = kvm_get_msr(vcpu, &msr);
- if (r)
- return r;
-
- *data = msr.data;
- return 0;
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
}
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- struct msr_data msr;
-
- msr.data = *data;
- msr.index = index;
- msr.host_initiated = true;
- return kvm_set_msr(vcpu, &msr);
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ u64 base_cycles;
+ u64 offset;
+};
+
struct pvclock_gtod_data {
seqcount_t seq;
- struct { /* extract of a clocksource struct */
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
- } clock;
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
- u64 boot_ns;
- u64 nsec_base;
+ ktime_t offs_boot;
u64 wall_time_sec;
};
@@ -1397,44 +1955,54 @@
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns;
-
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
- vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
vdata->wall_time_sec = tk->xtime_sec;
+ vdata->offs_boot = tk->offs_boot;
+
write_seqcount_end(&vdata->seq);
}
-#endif
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+static s64 get_kvmclock_base_ns(void)
{
- /*
- * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
- * vcpu_enter_guest. This function is only called from
- * the physical CPU that is running vcpu.
- */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
+#endif
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec64 boot;
+ u64 wall_nsec;
+
+ kvm->arch.wall_clock = wall_clock;
if (!wall_clock)
return;
@@ -1454,23 +2022,46 @@
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
+ * wall clock specified here. We do the reverse here.
*/
- getboottime64(&boot);
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) {
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
- boot = timespec64_sub(boot, ts);
- }
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
- wc.nsec = boot.tv_nsec;
+ wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
version++;
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+ bool old_msr, bool host_initiated)
+{
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
+ }
+
+ vcpu->arch.time = system_time;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ vcpu->arch.pv_time_enabled = false;
+ if (!(system_time & 1))
+ return;
+
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_time, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info)))
+ vcpu->arch.pv_time_enabled = true;
+
+ return;
}
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
@@ -1505,9 +2096,6 @@
*pshift = shift;
*pmultiplier = div_frac(scaled64, tps32);
-
- pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
- __func__, base_hz, scaled_hz, shift, *pmultiplier);
}
#ifdef CONFIG_X86_64
@@ -1604,7 +2192,7 @@
static inline int gtod_is_based_on_tsc(int mode)
{
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
@@ -1633,12 +2221,6 @@
atomic_read(&vcpu->kvm->online_vcpus),
ka->use_master_clock, gtod->clock.vclock_mode);
#endif
-}
-
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
}
/*
@@ -1679,15 +2261,14 @@
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
-
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
+ vcpu->arch.l1_tsc_offset = offset;
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -1697,29 +2278,28 @@
* TSC is marked unstable when we're running on Hyper-V,
* 'TSC page' clocksource is good.
*/
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
return false;
#endif
return check_tsc_unstable();
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched;
bool already_matched;
- u64 data = msr->data;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boot_ns();
+ ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
- if (data == 0 && msr->host_initiated) {
+ if (data == 0) {
/*
* detection of vcpu initialization -- need to sync
* with other vCPUs. This particularly helps to keep
@@ -1750,12 +2330,10 @@
vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
if (!kvm_check_tsc_unstable()) {
offset = kvm->arch.cur_tsc_offset;
- pr_debug("kvm: matched tsc offset for %llu\n", data);
} else {
u64 delta = nsec_to_cycles(vcpu, elapsed);
data += delta;
offset = kvm_compute_tsc_offset(vcpu, data);
- pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
matched = true;
already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
@@ -1774,8 +2352,6 @@
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
matched = false;
- pr_debug("kvm: new tsc generation %llu, clock %llu\n",
- kvm->arch.cur_tsc_generation, data);
}
/*
@@ -1793,9 +2369,6 @@
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
- update_ia32_tsc_adjust_msr(vcpu, offset);
-
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -1810,12 +2383,10 @@
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
@@ -1849,43 +2420,43 @@
return last;
}
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
{
long v;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
u64 tsc_pg_val;
- switch (gtod->clock.vclock_mode) {
- case VCLOCK_HVCLOCK:
+ switch (clock->vclock_mode) {
+ case VDSO_CLOCKMODE_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
/* TSC page valid */
- *mode = VCLOCK_HVCLOCK;
- v = (tsc_pg_val - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
} else {
/* TSC page invalid */
- *mode = VCLOCK_NONE;
+ *mode = VDSO_CLOCKMODE_NONE;
}
break;
- case VCLOCK_TSC:
- *mode = VCLOCK_TSC;
+ case VDSO_CLOCKMODE_TSC:
+ *mode = VDSO_CLOCKMODE_TSC;
*tsc_timestamp = read_tsc();
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
break;
default:
- *mode = VCLOCK_NONE;
+ *mode = VDSO_CLOCKMODE_NONE;
}
- if (*mode == VCLOCK_NONE)
+ if (*mode == VDSO_CLOCKMODE_NONE)
*tsc_timestamp = v = 0;
- return v * gtod->clock.mult;
+ return v * clock->mult;
}
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -1894,10 +2465,10 @@
do {
seq = read_seqcount_begin(>od->seq);
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
- ns >>= gtod->clock.shift;
- ns += gtod->boot_ns;
+ ns = gtod->raw_clock.base_cycles;
+ ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode);
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
*t = ns;
@@ -1914,8 +2485,8 @@
do {
seq = read_seqcount_begin(>od->seq);
ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(>od->clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
@@ -1932,7 +2503,7 @@
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
tsc_timestamp));
}
@@ -2057,7 +2628,7 @@
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boot_ns() + ka->kvmclock_offset;
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2073,7 +2644,7 @@
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boot_ns() + ka->kvmclock_offset;
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2172,7 +2743,7 @@
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boot_ns();
+ kernel_ns = get_kvmclock_base_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -2284,6 +2855,18 @@
KVMCLOCK_SYNC_PERIOD);
}
+/*
+ * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
+ */
+static bool can_set_mci_status(struct kvm_vcpu *vcpu)
+{
+ /* McStatusWrEn enabled? */
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
+ return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
+
+ return false;
+}
+
static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2313,14 +2896,22 @@
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
- * this to avoid an uncatched #GP in the guest
+ * this to avoid an uncatched #GP in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
+ * correctable, single-bit ECC data errors.
*/
if ((offset & 0x3) == 0 &&
- data != 0 && (data | (1 << 10)) != ~(u64)0)
- return -1;
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
+
+ /* MCi_STATUS */
if (!msr_info->host_initiated &&
- (offset & 0x3) == 1 && data != 0)
- return -1;
+ (offset & 0x3) == 1 && data != 0) {
+ if (!can_set_mci_status(vcpu))
+ return 1;
+ }
+
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2340,104 +2931,192 @@
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
u8 *page;
- int r;
- r = -E2BIG;
if (page_num >= blob_size)
- goto out;
- r = -ENOMEM;
+ return 1;
+
page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page)) {
- r = PTR_ERR(page);
- goto out;
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
}
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
- goto out_free;
- r = 0;
-out_free:
- kfree(page);
-out:
- return r;
+ return 0;
+}
+
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
}
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
- /* Bits 3:5 are reserved, Should be zero */
- if (data & 0x38)
+ /* Bits 4:5 are reserved, Should be zero */
+ if (data & 0x30)
return 1;
- vcpu->arch.apf.msr_val = data;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+ return 1;
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return data ? 1 : 0;
+
+ vcpu->arch.apf.msr_en_val = data;
+
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
return 0;
}
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
- sizeof(u32)))
+ sizeof(u64)))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
kvm_async_pf_wakeup_all(vcpu);
+
+ return 0;
+}
+
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
+{
+ /* Bits 8-63 are reserved */
+ if (data >> 8)
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ vcpu->arch.apf.msr_int_val = data;
+
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
return 0;
}
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.pv_time_enabled = false;
+ vcpu->arch.time = 0;
}
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+ kvm_x86_ops.tlb_flush_all(vcpu);
+}
+
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops.tlb_flush_guest(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ u64 steal;
+ u32 version;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- /* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
- kvm_vcpu_flush_tlb(vcpu, false);
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = 0;
+ int err = -EFAULT;
- vcpu->arch.st.preempted = 0;
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
- if (st->version & 1)
- st->version += 1; /* first time write, random junk */
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
- st->version += 1;
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
+
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
+ } else {
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
+ }
+
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
+
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
smp_wmb();
- st->steal += current->sched_info.run_delay -
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
- smp_wmb();
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
- st->version += 1;
-
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -2465,14 +3144,31 @@
return 1;
vcpu->arch.arch_capabilities = data;
break;
+ case MSR_IA32_PERF_CAPABILITIES: {
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+
+ if (!msr_info->host_initiated)
+ return 1;
+ if (kvm_get_msr_feature(&msr_ent))
+ return 1;
+ if (data & ~msr_ent.data)
+ return 1;
+
+ vcpu->arch.perf_capabilities = data;
+
+ return 0;
+ }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
data &= ~(u64)0x40; /* ignore flush filter disable */
data &= ~(u64)0x100; /* ignore ignne emulation enable */
data &= ~(u64)0x8; /* ignore TLB cache disable */
- data &= ~(u64)0x40000; /* ignore Mc status write enable */
- if (data != 0) {
+
+ /* Handle McStatusWrEn */
+ if (data == BIT_ULL(18)) {
+ vcpu->arch.msr_hwcr = data;
+ } else if (data != 0) {
vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
data);
return 1;
@@ -2493,9 +3189,9 @@
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
- }
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ } else if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
@@ -2520,15 +3216,46 @@
}
break;
case MSR_IA32_MISC_ENABLE:
- vcpu->arch.ia32_misc_enable_msr = data;
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
+ ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
+ return 1;
+ vcpu->arch.ia32_misc_enable_msr = data;
+ kvm_update_cpuid_runtime(vcpu);
+ } else {
+ vcpu->arch.ia32_misc_enable_msr = data;
+ }
break;
case MSR_IA32_SMBASE:
if (!msr_info->host_initiated)
return 1;
vcpu->arch.smbase = data;
break;
+ case MSR_IA32_POWER_CTL:
+ vcpu->arch.msr_ia32_power_ctl = data;
+ break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr_info);
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, data);
+ } else {
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
+ break;
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ /*
+ * KVM supports exposing PT to the guest, but does not support
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
+ * XSAVES/XRSTORS to save/restore PT MSRs.
+ */
+ if (data & ~supported_xss)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -2536,46 +3263,54 @@
vcpu->arch.smi_count = data;
break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
case MSR_KVM_WALL_CLOCK:
- vcpu->kvm->arch.wall_clock = data;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
kvm_write_wall_clock(vcpu->kvm, data);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
- case MSR_KVM_SYSTEM_TIME: {
- struct kvm_arch *ka = &vcpu->kvm->arch;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
- kvmclock_reset(vcpu);
-
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
-
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-
- ka->boot_vcpu_runs_old_kvmclock = tmp;
- }
-
- vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
-
- /* we verify if the enable bit is set... */
- if (!(data & 1))
- break;
-
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.pv_time, data & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = false;
- else
- vcpu->arch.pv_time_enabled = true;
-
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
break;
- }
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
+ break;
case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+ if (data & 0x1) {
+ vcpu->arch.apf.pageready_pending = false;
+ kvm_check_async_pf_completion(vcpu);
+ }
+ break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
if (unlikely(!sched_info_on()))
return 1;
@@ -2592,8 +3327,22 @@
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
+ break;
+
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ /* only enable bit supported */
+ if (data & (-1ULL << 1))
+ return 1;
+
+ vcpu->arch.msr_kvm_poll_control = data;
break;
case MSR_IA32_MCG_CTL:
@@ -2603,7 +3352,8 @@
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true; /* fall through */
+ pr = true;
+ fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
@@ -2624,6 +3374,8 @@
*/
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
@@ -2669,33 +3421,11 @@
return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu,
- "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- break;
- }
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_msr_common);
-
-
-/*
- * Reads an msr value (of 'msr_index') into 'pdata'.
- * Returns 0 on success, non-0 otherwise.
- * Assumes vcpu_load() was already called.
- */
-int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
-{
- return kvm_x86_ops->get_msr(vcpu, msr);
-}
-EXPORT_SYMBOL_GPL(kvm_get_msr);
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
{
@@ -2748,7 +3478,6 @@
case MSR_K8_SYSCFG:
case MSR_K8_TSEG_ADDR:
case MSR_K8_TSEG_MASK:
- case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
@@ -2757,6 +3486,17 @@
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
@@ -2765,7 +3505,7 @@
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ return kvm_pmu_get_msr(vcpu, msr_info);
msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
@@ -2777,9 +3517,31 @@
return 1;
msr_info->data = vcpu->arch.arch_capabilities;
break;
- case MSR_IA32_TSC:
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
break;
+ case MSR_IA32_POWER_CTL:
+ msr_info->data = vcpu->arch.msr_ia32_power_ctl;
+ break;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * return L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+ vcpu->arch.tsc_offset;
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
+ break;
+ }
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -2805,7 +3567,6 @@
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- break;
case MSR_IA32_TSCDEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
@@ -2833,21 +3594,64 @@
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
- msr_info->data = vcpu->arch.apf.msr_val;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_en_val;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_int_val;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
msr_info->data = vcpu->arch.pv_eoi.msr_val;
+ break;
+ case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
+ msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
@@ -2857,6 +3661,12 @@
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2870,6 +3680,8 @@
msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
@@ -2879,7 +3691,6 @@
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
- break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
* silicon. It is however accessed by winxp in very narrow
@@ -2912,20 +3723,13 @@
case MSR_MISC_FEATURES_ENABLES:
msr_info->data = vcpu->arch.msr_misc_features_enables;
break;
+ case MSR_K7_HWCR:
+ msr_info->data = vcpu->arch.msr_hwcr;
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
- msr_info->index);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
- msr_info->index);
- msr_info->data = 0;
- }
- break;
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
@@ -2966,7 +3770,7 @@
unsigned size;
r = -EFAULT;
- if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+ if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
goto out;
r = -E2BIG;
@@ -3037,24 +3841,33 @@
case KVM_CAP_HYPERV_VP_INDEX:
case KVM_CAP_HYPERV_EVENTFD:
case KVM_CAP_HYPERV_TLBFLUSH:
+ case KVM_CAP_HYPERV_SEND_IPI:
+ case KVM_CAP_HYPERV_CPUID:
case KVM_CAP_PCI_SEGMENT:
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
- case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
case KVM_CAP_IMMEDIATE_EXIT:
+ case KVM_CAP_PMU_EVENT_FILTER:
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -3064,7 +3877,8 @@
r = KVM_CLOCK_TSC_STABLE;
break;
case KVM_CAP_X86_DISABLE_EXITS:
- r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE;
+ r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
+ KVM_X86_DISABLE_EXITS_CSTATE;
if(kvm_can_mwait_in_guest())
r |= KVM_X86_DISABLE_EXITS_MWAIT;
break;
@@ -3077,10 +3891,10 @@
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
@@ -3090,9 +3904,6 @@
break;
case KVM_CAP_MAX_VCPU_ID:
r = KVM_MAX_VCPU_ID;
- break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_USER_MEM_SLOTS;
break;
case KVM_CAP_PV_MMU: /* obsolete */
r = 0;
@@ -3110,8 +3921,20 @@
r = KVM_X2APIC_API_VALID_FLAGS;
break;
case KVM_CAP_NESTED_STATE:
- r = kvm_x86_ops->get_nested_state ?
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+ r = kvm_x86_ops.nested_ops->get_state ?
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
+ break;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
+ break;
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
+ break;
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
+ case KVM_CAP_STEAL_TIME:
+ r = sched_info_on();
break;
default:
break;
@@ -3133,11 +3956,11 @@
unsigned n;
r = -EFAULT;
- if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+ if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
goto out;
n = msr_list.nmsrs;
msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
- if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+ if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
goto out;
r = -E2BIG;
if (n < msr_list.nmsrs)
@@ -3159,7 +3982,7 @@
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
@@ -3168,12 +3991,12 @@
goto out;
r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
}
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
r = -EFAULT;
if (copy_to_user(argp, &kvm_mce_cap_supported,
sizeof(kvm_mce_cap_supported)))
@@ -3205,9 +4028,9 @@
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
- }
default:
r = -EINVAL;
+ break;
}
out:
return r;
@@ -3227,14 +4050,17 @@
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops->has_wbinvd_exit())
+ if (kvm_x86_ops.has_wbinvd_exit())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- kvm_x86_ops->vcpu_load(vcpu, cpu);
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
@@ -3275,52 +4101,68 @@
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
if (vcpu->arch.st.preempted)
return;
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted)
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+ if (vcpu->preempted) {
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
- /*
- * Disable page faults because we're in atomic context here.
- * kvm_write_guest_offset_cached() would call might_fault()
- * that relies on pagefault_disable() to tell if there's a
- * bug. NOTE: the write to guest memory may not go through if
- * during postcopy live migration or if there's heavy guest
- * paging.
- */
- pagefault_disable();
- /*
- * kvm_memslots() will be called by
- * kvm_write_guest_offset_cached() so take the srcu lock.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- pagefault_enable();
- kvm_x86_ops->vcpu_put(vcpu);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
+ kvm_x86_ops.vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -3334,7 +4176,7 @@
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -3453,8 +4295,7 @@
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- if (kvm_x86_ops->setup_mce)
- kvm_x86_ops->setup_mce(vcpu);
+ kvm_x86_ops.setup_mce(vcpu);
out:
return r;
}
@@ -3516,28 +4357,56 @@
process_smi(vcpu);
/*
- * FIXME: pass injected and pending separately. This is only
- * needed for nested virtualization, whose state cannot be
- * migrated yet. For now we can combine them.
+ * In guest mode, payload delivery should be deferred,
+ * so that the L1 hypervisor can intercept #PF before
+ * CR2 is modified (or intercept #DB before DR6 is
+ * modified under nVMX). Unless the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
+ * opportunistically defer the exception payload, deliver it if the
+ * capability hasn't been requested before processing a
+ * KVM_GET_VCPU_EVENTS.
*/
- events->exception.injected =
- (vcpu->arch.exception.pending ||
- vcpu->arch.exception.injected) &&
- !kvm_exception_is_soft(vcpu->arch.exception.nr);
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+ kvm_deliver_exception_payload(vcpu);
+
+ /*
+ * The API doesn't provide the instruction length for software
+ * exceptions, so don't report them. As long as the guest RIP
+ * isn't advanced, we should expect to encounter the exception
+ * again.
+ */
+ if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+ events->exception.injected = 0;
+ events->exception.pending = 0;
+ } else {
+ events->exception.injected = vcpu->arch.exception.injected;
+ events->exception.pending = vcpu->arch.exception.pending;
+ /*
+ * For ABI compatibility, deliberately conflate
+ * pending and injected exceptions when
+ * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ events->exception.injected |=
+ vcpu->arch.exception.pending;
+ }
events->exception.nr = vcpu->arch.exception.nr;
events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.pad = 0;
events->exception.error_code = vcpu->arch.exception.error_code;
+ events->exception_has_payload = vcpu->arch.exception.has_payload;
+ events->exception_payload = vcpu->arch.exception.payload;
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -3551,10 +4420,13 @@
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SHADOW
| KVM_VCPUEVENT_VALID_SMM);
+ if (vcpu->kvm->arch.exception_payload_enabled)
+ events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
+
memset(&events->reserved, 0, sizeof(events->reserved));
}
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu);
static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
@@ -3562,12 +4434,24 @@
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
| KVM_VCPUEVENT_VALID_SHADOW
- | KVM_VCPUEVENT_VALID_SMM))
+ | KVM_VCPUEVENT_VALID_SMM
+ | KVM_VCPUEVENT_VALID_PAYLOAD))
return -EINVAL;
- if (events->exception.injected &&
- (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
- is_guest_mode(vcpu)))
+ if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
+ if (!vcpu->kvm->arch.exception_payload_enabled)
+ return -EINVAL;
+ if (events->exception.pending)
+ events->exception.injected = 0;
+ else
+ events->exception_has_payload = 0;
+ } else {
+ events->exception.pending = 0;
+ events->exception_has_payload = 0;
+ }
+
+ if ((events->exception.injected || events->exception.pending) &&
+ (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
return -EINVAL;
/* INITs are latched while in SMM */
@@ -3577,35 +4461,40 @@
return -EINVAL;
process_nmi(vcpu);
- vcpu->arch.exception.injected = false;
- vcpu->arch.exception.pending = events->exception.injected;
+ vcpu->arch.exception.injected = events->exception.injected;
+ vcpu->arch.exception.pending = events->exception.pending;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
+ vcpu->arch.exception.has_payload = events->exception_has_payload;
+ vcpu->arch.exception.payload = events->exception_payload;
vcpu->arch.interrupt.injected = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops->set_interrupt_shadow(vcpu,
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- u32 hflags = vcpu->arch.hflags;
- if (events->smi.smm)
- hflags |= HF_SMM_MASK;
- else
- hflags &= ~HF_SMM_MASK;
- kvm_set_hflags(vcpu, hflags);
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ if (events->smi.smm)
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
+ kvm_smm_changed(vcpu);
+ }
vcpu->arch.smi_pending = events->smi.pending;
@@ -3614,12 +4503,13 @@
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- }
+ }
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
}
@@ -3633,12 +4523,11 @@
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -3655,7 +4544,6 @@
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
- kvm_update_dr6(vcpu);
vcpu->arch.dr7 = dbgregs->dr7;
kvm_update_dr7(vcpu);
@@ -3666,7 +4554,7 @@
static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
u64 xstate_bv = xsave->header.xfeatures;
u64 valid;
@@ -3686,15 +4574,15 @@
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
- u64 feature = valid & -valid;
- int index = fls64(feature) - 1;
- void *src = get_xsave_addr(xsave, feature);
+ u64 xfeature_mask = valid & -valid;
+ int xfeature_nr = fls64(xfeature_mask) - 1;
+ void *src = get_xsave_addr(xsave, xfeature_nr);
if (src) {
u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, index,
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
&size, &offset, &ecx, &edx);
- if (feature == XFEATURE_MASK_PKRU)
+ if (xfeature_nr == XFEATURE_PKRU)
memcpy(dest + offset, &vcpu->arch.pkru,
sizeof(vcpu->arch.pkru));
else
@@ -3702,13 +4590,13 @@
}
- valid -= feature;
+ valid -= xfeature_mask;
}
}
static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
u64 valid;
@@ -3729,22 +4617,22 @@
*/
valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
while (valid) {
- u64 feature = valid & -valid;
- int index = fls64(feature) - 1;
- void *dest = get_xsave_addr(xsave, feature);
+ u64 xfeature_mask = valid & -valid;
+ int xfeature_nr = fls64(xfeature_mask) - 1;
+ void *dest = get_xsave_addr(xsave, xfeature_nr);
if (dest) {
u32 size, offset, ecx, edx;
- cpuid_count(XSTATE_CPUID, index,
+ cpuid_count(XSTATE_CPUID, xfeature_nr,
&size, &offset, &ecx, &edx);
- if (feature == XFEATURE_MASK_PKRU)
+ if (xfeature_nr == XFEATURE_PKRU)
memcpy(&vcpu->arch.pkru, src + offset,
sizeof(vcpu->arch.pkru));
else
memcpy(dest, src + offset, size);
}
- valid -= feature;
+ valid -= xfeature_mask;
}
}
@@ -3756,7 +4644,7 @@
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state.fxsave,
+ &vcpu->arch.guest_fpu->state.fxsave,
sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
XFEATURE_MASK_FPSSE;
@@ -3778,15 +4666,14 @@
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0() ||
- mxcsr & ~mxcsr_feature_mask)
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
if (xstate_bv & ~XFEATURE_MASK_FPSSE ||
mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+ memcpy(&vcpu->arch.guest_fpu->state.fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
}
return 0;
@@ -3847,6 +4734,10 @@
static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
struct kvm_enable_cap *cap)
{
+ int r;
+ uint16_t vmcs_version;
+ void __user *user_ptr;
+
if (cap->flags)
return -EINVAL;
@@ -3854,11 +4745,37 @@
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
+ fallthrough;
+
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
return -EINVAL;
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
+ case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
+ return -ENOTTY;
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
+ if (!r) {
+ user_ptr = (void __user *)(uintptr_t)cap->args[0];
+ if (copy_to_user(user_ptr, &vmcs_version,
+ sizeof(vmcs_version)))
+ r = -EFAULT;
+ }
+ return r;
+ case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
+ if (!kvm_x86_ops.enable_direct_tlbflush)
+ return -ENOTTY;
+
+ return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ if (vcpu->arch.pv_cpuid.enforce)
+ kvm_update_pv_runtime(vcpu);
+
+ return 0;
+
default:
return -EINVAL;
}
@@ -3885,7 +4802,8 @@
r = -EINVAL;
if (!lapic_in_kernel(vcpu))
goto out;
- u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+ u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+ GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.lapic)
@@ -3916,7 +4834,7 @@
struct kvm_interrupt irq;
r = -EFAULT;
- if (copy_from_user(&irq, argp, sizeof irq))
+ if (copy_from_user(&irq, argp, sizeof(irq)))
goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
break;
@@ -3934,7 +4852,7 @@
struct kvm_cpuid cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
break;
@@ -3944,7 +4862,7 @@
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
@@ -3955,14 +4873,14 @@
struct kvm_cpuid2 cpuid;
r = -EFAULT;
- if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
goto out;
r = 0;
break;
@@ -3983,13 +4901,13 @@
struct kvm_tpr_access_ctl tac;
r = -EFAULT;
- if (copy_from_user(&tac, argp, sizeof tac))
+ if (copy_from_user(&tac, argp, sizeof(tac)))
goto out;
r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
if (r)
goto out;
r = -EFAULT;
- if (copy_to_user(argp, &tac, sizeof tac))
+ if (copy_to_user(argp, &tac, sizeof(tac)))
goto out;
r = 0;
break;
@@ -4002,7 +4920,7 @@
if (!lapic_in_kernel(vcpu))
goto out;
r = -EFAULT;
- if (copy_from_user(&va, argp, sizeof va))
+ if (copy_from_user(&va, argp, sizeof(va)))
goto out;
idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
@@ -4013,7 +4931,7 @@
u64 mcg_cap;
r = -EFAULT;
- if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
+ if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
goto out;
r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
break;
@@ -4022,7 +4940,7 @@
struct kvm_x86_mce mce;
r = -EFAULT;
- if (copy_from_user(&mce, argp, sizeof mce))
+ if (copy_from_user(&mce, argp, sizeof(mce)))
goto out;
r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
break;
@@ -4072,7 +4990,7 @@
break;
}
case KVM_GET_XSAVE: {
- u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+ u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xsave)
break;
@@ -4096,7 +5014,7 @@
break;
}
case KVM_GET_XCRS: {
- u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+ u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
r = -ENOMEM;
if (!u.xcrs)
break;
@@ -4126,7 +5044,8 @@
r = -EINVAL;
user_tsc_khz = (u32)arg;
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
@@ -4159,7 +5078,7 @@
u32 user_data_size;
r = -EINVAL;
- if (!kvm_x86_ops->get_nested_state)
+ if (!kvm_x86_ops.nested_ops->get_state)
break;
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
@@ -4167,8 +5086,8 @@
if (get_user(user_data_size, &user_kvm_nested_state->size))
break;
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
- user_data_size);
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+ user_data_size);
if (r < 0)
break;
@@ -4189,7 +5108,7 @@
int idx;
r = -EINVAL;
- if (!kvm_x86_ops->set_nested_state)
+ if (!kvm_x86_ops.nested_ops->set_state)
break;
r = -EFAULT;
@@ -4201,16 +5120,38 @@
break;
if (kvm_state.flags &
- ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+ ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+ | KVM_STATE_NESTED_GIF_SET))
break;
/* nested_run_pending implies guest_mode. */
- if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+ if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
+ && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ break;
+ }
+ case KVM_GET_SUPPORTED_HV_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
+ goto out;
+
+ r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
+ goto out;
+ r = 0;
break;
}
default:
@@ -4234,14 +5175,14 @@
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -4382,9 +5323,6 @@
{
struct kvm_pit *pit = kvm->arch.vpit;
- if (!pit)
- return -ENXIO;
-
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -4396,50 +5334,13 @@
return 0;
}
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- * 1. Take a snapshot of the bit and clear it if needed.
- * 2. Write protect the corresponding page.
- * 3. Copy the snapshot to the userspace.
- * 4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- bool is_dirty = false;
- int r;
-
- mutex_lock(&kvm->slots_lock);
-
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
- if (kvm_x86_ops->flush_log_dirty)
- kvm_x86_ops->flush_log_dirty(kvm);
-
- r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
-
- /*
- * All the TLBs can be flushed out of mmu lock, see the comments in
- * kvm_mmu_slot_remove_write_access().
- */
- lockdep_assert_held(&kvm->slots_lock);
- if (is_dirty)
- kvm_flush_remote_tlbs(kvm);
-
- mutex_unlock(&kvm->slots_lock);
- return r;
+ if (kvm_x86_ops.flush_log_dirty)
+ kvm_x86_ops.flush_log_dirty(kvm);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -4454,8 +5355,8 @@
return 0;
}
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
- struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
{
int r;
@@ -4513,10 +5414,25 @@
kvm->arch.hlt_in_guest = true;
if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
kvm->arch.pause_in_guest = true;
+ if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
+ kvm->arch.cstate_in_guest = true;
r = 0;
break;
case KVM_CAP_MSR_PLATFORM_INFO:
kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_EXCEPTION_PAYLOAD:
+ kvm->arch.exception_payload_enabled = cap->args[0];
+ r = 0;
+ break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ r = -EINVAL;
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER))
+ break;
+ kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
break;
default:
@@ -4525,6 +5441,180 @@
}
return r;
}
+
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
+{
+ u32 i;
+
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
+
+ kfree(msr_filter);
+}
+
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
+{
+ struct msr_bitmap_range range;
+ unsigned long *bitmap = NULL;
+ size_t bitmap_size;
+ int r;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ range = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ if (!range.flags) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ /* Everything ok, add this range identifier. */
+ msr_filter->ranges[msr_filter->count] = range;
+ msr_filter->count++;
+
+ return 0;
+err:
+ kfree(bitmap);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
+ struct kvm_msr_filter *filter)
+{
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
+ bool default_allow;
+ bool empty = true;
+ int r = 0;
+ u32 i;
+
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+ return -EINVAL;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
+ empty &= !filter->ranges[i].nmsrs;
+
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ if (empty && !default_allow)
+ return -EINVAL;
+
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
+ }
+
+ mutex_lock(&kvm->lock);
+
+ /* The per-VM filter is protected by kvm->lock... */
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+#ifdef CONFIG_KVM_COMPAT
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range_compat {
+ __u32 flags;
+ __u32 nmsrs;
+ __u32 base;
+ __u32 bitmap;
+};
+
+struct kvm_msr_filter_compat {
+ __u32 flags;
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
+
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
+
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct kvm *kvm = filp->private_data;
+ long r = -ENOTTY;
+
+ switch (ioctl) {
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter_compat filter_compat;
+ struct kvm_msr_filter filter;
+ int i;
+
+ if (copy_from_user(&filter_compat, user_msr_filter,
+ sizeof(filter_compat)))
+ return -EFAULT;
+
+ filter.flags = filter_compat.flags;
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ struct kvm_msr_filter_range_compat *cr;
+
+ cr = &filter_compat.ranges[i];
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
+ .flags = cr->flags,
+ .nmsrs = cr->nmsrs,
+ .base = cr->base,
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
+ };
+ }
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+ break;
+ }
+ }
+
+ return r;
+}
+#endif
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
@@ -4555,7 +5645,7 @@
if (kvm->created_vcpus)
goto set_identity_unlock;
r = -EFAULT;
- if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
+ if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
set_identity_unlock:
@@ -4639,7 +5729,7 @@
if (r)
goto get_irqchip_out;
r = -EFAULT;
- if (copy_to_user(argp, chip, sizeof *chip))
+ if (copy_to_user(argp, chip, sizeof(*chip)))
goto get_irqchip_out;
r = 0;
get_irqchip_out:
@@ -4660,9 +5750,6 @@
if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
set_irqchip_out:
kfree(chip);
break;
@@ -4685,7 +5772,7 @@
}
case KVM_SET_PIT: {
r = -EFAULT;
- if (copy_from_user(&u.ps, argp, sizeof u.ps))
+ if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
goto out;
mutex_lock(&kvm->lock);
r = -ENXIO;
@@ -4726,6 +5813,9 @@
struct kvm_reinject_control control;
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
@@ -4790,19 +5880,10 @@
r = 0;
break;
}
- case KVM_ENABLE_CAP: {
- struct kvm_enable_cap cap;
-
- r = -EFAULT;
- if (copy_from_user(&cap, argp, sizeof(cap)))
- goto out;
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
- break;
- }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_op)
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
+ if (kvm_x86_ops.mem_enc_op)
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -4813,8 +5894,8 @@
goto out;
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_reg_region)
- r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion);
+ if (kvm_x86_ops.mem_enc_reg_region)
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -4825,8 +5906,8 @@
goto out;
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_unreg_region)
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion);
+ if (kvm_x86_ops.mem_enc_unreg_region)
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -4838,6 +5919,19 @@
r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
break;
}
+ case KVM_SET_PMU_EVENT_FILTER:
+ r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
+ break;
+ case KVM_X86_SET_MSR_FILTER: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+ break;
+ }
default:
r = -ENOTTY;
}
@@ -4847,58 +5941,96 @@
static void kvm_init_msr_list(void)
{
+ struct x86_pmu_capability x86_pmu;
u32 dummy[2];
- unsigned i, j;
+ unsigned i;
- for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
- if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+ BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
+ "Please update the fixed PMCs in msrs_to_saved_all[]");
+
+ perf_get_x86_pmu_capability(&x86_pmu);
+
+ num_msrs_to_save = 0;
+ num_emulated_msrs = 0;
+ num_msr_based_features = 0;
+
+ for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
+ if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
continue;
/*
* Even MSRs that are valid in the host may not be exposed
* to the guests in some cases.
*/
- switch (msrs_to_save[i]) {
+ switch (msrs_to_save_all[i]) {
case MSR_IA32_BNDCFGS:
if (!kvm_mpx_supported())
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_x86_ops->rdtscp_supported())
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
+ continue;
+ break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
+ continue;
+ break;
+ case MSR_IA32_RTIT_CTL:
+ case MSR_IA32_RTIT_STATUS:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
+ continue;
+ break;
+ case MSR_IA32_RTIT_CR3_MATCH:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
+ continue;
+ break;
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
+ !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
+ continue;
+ break;
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
+ msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
+ intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
+ continue;
+ break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
continue;
break;
default:
break;
}
- if (j < i)
- msrs_to_save[j] = msrs_to_save[i];
- j++;
+ msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
}
- num_msrs_to_save = j;
- for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i]))
+ for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
continue;
- if (j < i)
- emulated_msrs[j] = emulated_msrs[i];
- j++;
+ emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
}
- num_emulated_msrs = j;
- for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) {
+ for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
struct kvm_msr_entry msr;
- msr.index = msr_based_features[i];
+ msr.index = msr_based_features_all[i];
if (kvm_get_msr_feature(&msr))
continue;
- if (j < i)
- msr_based_features[j] = msr_based_features[i];
- j++;
+ msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
}
- num_msr_based_features = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -4947,13 +6079,13 @@
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops->set_segment(vcpu, var, seg);
+ kvm_x86_ops.set_segment(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops->get_segment(vcpu, var, seg);
+ kvm_x86_ops.get_segment(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -4965,7 +6097,7 @@
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
+ t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
return t_gpa;
}
@@ -4973,14 +6105,14 @@
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -4988,7 +6120,7 @@
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5037,7 +6169,7 @@
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -5062,7 +6194,7 @@
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -5083,7 +6215,7 @@
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -5136,7 +6268,7 @@
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -5149,13 +6281,6 @@
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
- /*
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
- * is returned, but our callers are not ready for that and they blindly
- * call kvm_inject_page_fault. Ensure that they at least do not leak
- * uninitialized kernel stack memory into cr2 and error code.
- */
- memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -5163,25 +6288,23 @@
int handle_ud(struct kvm_vcpu *vcpu)
{
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
int emul_type = EMULTYPE_TRAP_UD;
- enum emulation_result er;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ return 1;
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
- emul_type = 0;
+ emul_type = EMULTYPE_TRAP_UD_FORCED;
}
- er = kvm_emulate_instruction(vcpu, emul_type);
- if (er == EMULATE_USER_EXIT)
- return 0;
- if (er != EMULATE_DONE)
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
+ return kvm_emulate_instruction(vcpu, emul_type);
}
EXPORT_SYMBOL_GPL(handle_ud);
@@ -5204,7 +6327,7 @@
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -5214,7 +6337,7 @@
*/
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.access, 0, access)) {
+ vcpu->arch.mmio_access, 0, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -5323,7 +6446,7 @@
int handled, ret;
bool write = ops->write;
struct kvm_mmio_fragment *frag;
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
/*
* If the exit was due to a NPF we may already have a GPA.
@@ -5332,10 +6455,9 @@
* operation using rep will only have the initial GPA from the NPF
* occurred.
*/
- if (vcpu->arch.gpa_available &&
- emulator_can_use_gpa(ctxt) &&
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
- gpa = vcpu->arch.gpa_val;
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+ gpa = ctxt->gpa_val;
ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
} else {
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5456,9 +6578,10 @@
unsigned int bytes,
struct x86_exception *exception)
{
+ struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 page_line_mask;
gpa_t gpa;
- struct page *page;
char *kaddr;
bool exchanged;
@@ -5472,15 +6595,23 @@
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ /*
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
+ * enabled in the host and the access splits a cache line.
+ */
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ page_line_mask = ~(cache_line_size() - 1);
+ else
+ page_line_mask = PAGE_MASK;
+
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
- page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
+ if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
goto emul_write;
- kaddr = kmap_atomic(page);
- kaddr += offset_in_page(gpa);
+ kaddr = map.hva + offset_in_page(gpa);
+
switch (bytes) {
case 1:
exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
@@ -5497,13 +6628,12 @@
default:
BUG();
}
- kunmap_atomic(kaddr);
- kvm_release_page_dirty(page);
+
+ kvm_vcpu_unmap(vcpu, &map, true);
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
- kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_page_track_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -5557,11 +6687,9 @@
return 0;
}
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port, void *val,
- unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int ret;
if (vcpu->arch.pio.count)
@@ -5581,20 +6709,33 @@
return 0;
}
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port,
- const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, const void *val,
+ unsigned int count)
+{
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -5607,7 +6748,7 @@
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (kvm_x86_ops->has_wbinvd_exit()) {
+ if (kvm_x86_ops.has_wbinvd_exit()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -5712,27 +6853,27 @@
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -5810,28 +6951,33 @@
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- struct msr_data msr;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int r;
- msr.index = msr_index;
- msr.host_initiated = false;
- r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
- if (r)
- return r;
+ r = kvm_get_msr(vcpu, msr_index, pdata);
- *pdata = msr.data;
- return 0;
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- struct msr_data msr;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
- msr.data = data;
- msr.index = msr_index;
- msr.host_initiated = false;
- return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
+ r = kvm_set_msr(vcpu, msr_index, data);
+
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -5851,7 +6997,7 @@
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -5869,13 +7015,35 @@
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+ bool exact_only)
{
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
+}
+
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
}
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
@@ -5890,7 +7058,7 @@
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -5900,12 +7068,26 @@
static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
{
- kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.hflags = emul_flags;
+ kvm_mmu_reset_context(vcpu);
}
-static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
+ const char *smstate)
{
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+}
+
+static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
+{
+ kvm_smm_changed(emul_to_vcpu(ctxt));
+}
+
+static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
+{
+ return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
}
static const struct x86_emulate_ops emulate_ops = {
@@ -5944,15 +7126,21 @@
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .guest_has_long_mode = emulator_guest_has_long_mode,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
.pre_leave_smm = emulator_pre_leave_smm,
+ .post_leave_smm = emulator_post_leave_smm,
+ .set_xcr = emulator_set_xcr,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -5963,7 +7151,7 @@
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -5971,9 +7159,9 @@
static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
- return kvm_propagate_fault(vcpu, &ctxt->exception);
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
@@ -5983,13 +7171,31 @@
return false;
}
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt;
+
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+ if (!ctxt) {
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
+ return NULL;
+ }
+
+ ctxt->vcpu = vcpu;
+ ctxt->ops = &emulate_ops;
+ vcpu->arch.emulate_ctxt = ctxt;
+
+ return ctxt;
+}
+
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
@@ -6003,13 +7209,18 @@
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
-int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
+void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
@@ -6019,37 +7230,43 @@
ctxt->_eip = ctxt->eip + inc_eip;
ret = emulate_int_real(ctxt, irq);
- if (ret != X86EMUL_CONTINUE)
- return EMULATE_FAIL;
-
- ctxt->eip = ctxt->_eip;
- kvm_rip_write(vcpu, ctxt->eip);
- kvm_set_rflags(vcpu, ctxt->eflags);
-
- return EMULATE_DONE;
+ if (ret != X86EMUL_CONTINUE) {
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ } else {
+ ctxt->eip = ctxt->_eip;
+ kvm_rip_write(vcpu, ctxt->eip);
+ kvm_set_rflags(vcpu, ctxt->eflags);
+ }
}
EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
{
- int r = EMULATE_DONE;
-
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- if (emulation_type & EMULTYPE_NO_UD_ON_FAIL)
- return EMULATE_FAIL;
+ if (emulation_type & EMULTYPE_VMWARE_GP) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ if (emulation_type & EMULTYPE_SKIP) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
- r = EMULATE_USER_EXIT;
+ return 0;
}
kvm_queue_exception(vcpu, UD_VECTOR);
- return r;
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
+
+ return 1;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
@@ -6059,13 +7276,14 @@
gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
- if (!vcpu->arch.mmu.direct_map) {
+ if (!vcpu->arch.mmu->direct_map) {
/*
* Write permission should be allowed since only
* write access need to be emulated.
@@ -6098,7 +7316,7 @@
kvm_release_pfn_clean(pfn);
/* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu.direct_map) {
+ if (vcpu->arch.mmu->direct_map) {
unsigned int indirect_shadow_pages;
spin_lock(&vcpu->kvm->mmu_lock);
@@ -6150,10 +7368,11 @@
*/
vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
if (x86_page_table_writing_insn(ctxt))
@@ -6165,7 +7384,7 @@
vcpu->arch.last_retry_eip = ctxt->eip;
vcpu->arch.last_retry_addr = cr2_or_gpa;
- if (!vcpu->arch.mmu.direct_map)
+ if (!vcpu->arch.mmu->direct_map)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6189,16 +7408,6 @@
kvm_mmu_reset_context(vcpu);
}
-static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
-{
- unsigned changed = vcpu->arch.hflags ^ emul_flags;
-
- vcpu->arch.hflags = emul_flags;
-
- if (changed & HF_SMM_MASK)
- kvm_smm_changed(vcpu);
-}
-
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -6214,34 +7423,29 @@
return dr6;
}
-static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r)
+static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
{
struct kvm_run *kvm_run = vcpu->run;
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
- } else {
- /*
- * "Certain debug exceptions may clear bit 0-3. The
- * remaining contents of the DR6 register are never
- * cleared by the processor".
- */
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
+ return 0;
}
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
+ return 1;
}
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
- int r = EMULATE_DONE;
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
+ int r;
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
+ if (unlikely(!r))
+ return 0;
/*
* rflags is the old, "raw" value of the flags. The new value has
@@ -6252,12 +7456,12 @@
* that sets the TF flag".
*/
if (unlikely(rflags & X86_EFLAGS_TF))
- kvm_vcpu_do_singlestep(vcpu, &r);
- return r == EMULATE_DONE;
+ r = kvm_vcpu_do_singlestep(vcpu);
+ return r;
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
@@ -6272,7 +7476,7 @@
kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
- *r = EMULATE_USER_EXIT;
+ *r = 0;
return true;
}
}
@@ -6285,10 +7489,8 @@
vcpu->arch.db);
if (dr6 != 0) {
- vcpu->arch.dr6 &= ~15;
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
- *r = EMULATE_DONE;
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
+ *r = 1;
return true;
}
}
@@ -6327,13 +7529,45 @@
return false;
}
+/*
+ * Decode an instruction for emulation. The caller is responsible for handling
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been done by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ * response to a machine check.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int r;
+
+ init_emulate_ctxt(vcpu);
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
{
int r;
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ bool write_fault_to_spt;
+
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -6341,40 +7575,36 @@
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
*/
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
- kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
+ kvm_clear_exception_queue(vcpu);
/*
- * We will reenter on the same instruction since
- * we do not set complete_userspace_io. This does not
- * handle watchpoints yet, those would be handled in
- * the emulate_ops.
+ * Return immediately if RIP hits a code breakpoint, such #DBs
+ * are fault-like and are higher priority than any faults on
+ * the code fetch itself.
*/
if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
return r;
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
if (r != EMULATION_OK) {
- if (emulation_type & EMULTYPE_TRAP_UD)
- return EMULATE_FAIL;
- if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
- emulation_type))
- return EMULATE_DONE;
- if (ctxt->have_exception) {
+ if ((emulation_type & EMULTYPE_TRAP_UD) ||
+ (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+ }
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ write_fault_to_spt,
+ emulation_type))
+ return 1;
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
/*
* #UD should result in just EMULATION_FAILED, and trap-like
* exception should not be encountered during decode.
@@ -6382,27 +7612,32 @@
WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP);
inject_emulated_exception(vcpu);
- return EMULATE_DONE;
+ return 1;
}
- if (emulation_type & EMULTYPE_SKIP)
- return EMULATE_FAIL;
return handle_emulation_failure(vcpu, emulation_type);
}
}
- if ((emulation_type & EMULTYPE_VMWARE) &&
- !is_vmware_backdoor_opcode(ctxt))
- return EMULATE_FAIL;
+ if ((emulation_type & EMULTYPE_VMWARE_GP) &&
+ !is_vmware_backdoor_opcode(ctxt)) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ /*
+ * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks
+ * for kvm_skip_emulated_instruction(). The caller is responsible for
+ * updating interruptibility state and injecting single-step #DBs.
+ */
if (emulation_type & EMULTYPE_SKIP) {
kvm_rip_write(vcpu, ctxt->_eip);
if (ctxt->eflags & X86_EFLAGS_RF)
kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
- return EMULATE_DONE;
+ return 1;
}
if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
- return EMULATE_DONE;
+ return 1;
/* this is needed for vmware backdoor interface to work since it
changes registers values during IO operation */
@@ -6412,24 +7647,35 @@
}
restart:
- /* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2_or_gpa;
+ if (emulation_type & EMULTYPE_PF) {
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
+ if (vcpu->arch.mmu->direct_map) {
+ ctxt->gpa_available = true;
+ ctxt->gpa_val = cr2_or_gpa;
+ }
+ } else {
+ /* Sanitize the address out of an abundance of paranoia. */
+ ctxt->exception.address = 0;
+ }
r = x86_emulate_insn(ctxt);
if (r == EMULATION_INTERCEPTED)
- return EMULATE_DONE;
+ return 1;
if (r == EMULATION_FAILED) {
if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
emulation_type))
- return EMULATE_DONE;
+ return 1;
return handle_emulation_failure(vcpu, emulation_type);
}
if (ctxt->have_exception) {
- r = EMULATE_DONE;
+ r = 1;
if (inject_emulated_exception(vcpu))
return r;
} else if (vcpu->arch.pio.count) {
@@ -6440,26 +7686,36 @@
writeback = false;
vcpu->arch.complete_userspace_io = complete_emulated_pio;
}
- r = EMULATE_USER_EXIT;
+ r = 0;
} else if (vcpu->mmio_needed) {
+ ++vcpu->stat.mmio_exits;
+
if (!vcpu->mmio_is_write)
writeback = false;
- r = EMULATE_USER_EXIT;
+ r = 0;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
} else if (r == EMULATION_RESTART)
goto restart;
else
- r = EMULATE_DONE;
+ r = 1;
if (writeback) {
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r == EMULATE_DONE && ctxt->tf)
- kvm_vcpu_do_singlestep(vcpu, &r);
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
+ r = kvm_vcpu_do_singlestep(vcpu);
+ if (kvm_x86_ops.update_emulated_instruction)
+ kvm_x86_ops.update_emulated_instruction(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -6509,9 +7765,9 @@
static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
unsigned short port)
{
- unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
- size, port, &val, 1);
+ unsigned long val = kvm_rax_read(vcpu);
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
if (ret)
return ret;
@@ -6544,16 +7800,14 @@
}
/* For size less than 4 we merge, else we zero extend */
- val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
- : 0;
+ val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
/*
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
* the copy and tracing
*/
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
- vcpu->arch.pio.port, &val, 1);
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
+ kvm_rax_write(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -6565,12 +7819,11 @@
int ret;
/* For size less than 4 we merge, else we zero extend */
- val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+ val = (size < 4) ? kvm_rax_read(vcpu) : 0;
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
- &val, 1);
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
if (ret) {
- kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ kvm_rax_write(vcpu, val);
return ret;
}
@@ -6649,10 +7902,8 @@
}
#endif
-static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
{
- struct cpufreq_freqs *freq = data;
struct kvm *kvm;
struct kvm_vcpu *vcpu;
int i, send_ipi = 0;
@@ -6696,17 +7947,12 @@
*
*/
- if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
- return 0;
- if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
- return 0;
-
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
mutex_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->cpu != freq->cpu)
+ if (vcpu->cpu != cpu)
continue;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != raw_smp_processor_id())
@@ -6728,8 +7974,24 @@
* guest context is entered kvmclock will be updated,
* so the guest will not see stale values.
*/
- smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
+ smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
}
+}
+
+static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu;
+
+ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
+ return 0;
+ if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
+ return 0;
+
+ for_each_cpu(cpu, freq->policy->cpus)
+ __kvmclock_cpufreq_notifier(freq, cpu);
+
return 0;
}
@@ -6749,20 +8011,21 @@
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
+ struct cpufreq_policy *policy;
int cpu;
- memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
- pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
kvmclock_cpu_online, kvmclock_cpu_down_prep);
@@ -6781,7 +8044,7 @@
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -6796,10 +8059,20 @@
return ip;
}
+static void kvm_handle_intel_pt_intr(void)
+{
+ struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
+
+ kvm_make_request(KVM_REQ_PMI, vcpu);
+ __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
+ (unsigned long *)&vcpu->arch.pmu.global_status);
+}
+
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
+ .handle_intel_pt_intr = NULL,
};
#ifdef CONFIG_X86_64
@@ -6821,6 +8094,18 @@
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -6831,13 +8116,14 @@
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -6848,50 +8134,79 @@
int kvm_arch_init(void *opaque)
{
+ struct kvm_x86_init_ops *ops = opaque;
int r;
- struct kvm_x86_ops *ops = opaque;
- if (kvm_x86_ops) {
+ if (kvm_x86_ops.hardware_enable) {
printk(KERN_ERR "kvm: already loaded the other module\n");
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: disabled by bios\n");
+ r = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /*
+ * KVM explicitly assumes that the guest has an FPU and
+ * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
+ * vCPU's FPU state as a fxregs_state struct.
+ */
+ if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
+ printk(KERN_ERR "kvm: inadequate fpu\n");
r = -EOPNOTSUPP;
goto out;
}
r = -ENOMEM;
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
- if (!shared_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
+ __alignof__(struct fpu), SLAB_ACCOUNT,
+ NULL);
+ if (!x86_fpu_cache) {
+ printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
goto out;
}
- r = kvm_mmu_module_init();
+ x86_emulator_cache = kvm_alloc_emulator_cache();
+ if (!x86_emulator_cache) {
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
+ goto out_free_x86_fpu_cache;
+ }
+
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+ if (!user_return_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ goto out_free_x86_emulator_cache;
+ }
+
+ r = kvm_mmu_vendor_module_init();
if (r)
goto out_free_percpu;
-
- kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
perf_register_guest_info_callbacks(&kvm_guest_cbs);
- if (boot_cpu_has(X86_FEATURE_XSAVE))
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ }
kvm_lapic_init();
+ if (pi_inject_timer == -1)
+ pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
#ifdef CONFIG_X86_64
pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
@@ -6902,7 +8217,11 @@
return 0;
out_free_percpu:
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
+out_free_x86_emulator_cache:
+ kmem_cache_destroy(x86_emulator_cache);
+out_free_x86_fpu_cache:
+ kmem_cache_destroy(x86_fpu_cache);
out:
return r;
}
@@ -6915,6 +8234,7 @@
#endif
kvm_lapic_exit();
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -6922,11 +8242,14 @@
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops = NULL;
- kvm_mmu_module_exit();
- free_percpu(shared_msrs);
+ kvm_x86_ops.hardware_enable = NULL;
+ kvm_mmu_vendor_module_exit();
+ free_percpu(user_return_msrs);
+ kmem_cache_destroy(x86_emulator_cache);
+ kmem_cache_destroy(x86_fpu_cache);
}
int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
@@ -6990,22 +8313,52 @@
*/
static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
{
- struct kvm_lapic_irq lapic_irq;
+ /*
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
+ * common code, e.g. for tracing. Defer initialization to the compiler.
+ */
+ struct kvm_lapic_irq lapic_irq = {
+ .delivery_mode = APIC_DM_REMRD,
+ .dest_mode = APIC_DEST_PHYSICAL,
+ .shorthand = APIC_DEST_NOSHORT,
+ .dest_id = apicid,
+ };
- lapic_irq.shorthand = 0;
- lapic_irq.dest_mode = 0;
- lapic_irq.level = 0;
- lapic_irq.dest_id = apicid;
- lapic_irq.msi_redir_hint = false;
-
- lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+bool kvm_apicv_activated(struct kvm *kvm)
{
- vcpu->arch.apicv_active = false;
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+
+void kvm_apicv_init(struct kvm *kvm, bool enable)
+{
+ if (enable)
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+ else
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
+
+static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
+{
+ struct kvm_vcpu *target = NULL;
+ struct kvm_apic_map *map;
+
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
+
+ if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
+ target = map->phys_map[dest_id]->vcpu;
+
+ rcu_read_unlock();
+
+ if (target && READ_ONCE(target->ready))
+ kvm_vcpu_yield_to(target);
}
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
@@ -7016,11 +8369,11 @@
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
- nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
- a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
- a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
- a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
- a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
+ nr = kvm_rax_read(vcpu);
+ a0 = kvm_rbx_read(vcpu);
+ a1 = kvm_rcx_read(vcpu);
+ a2 = kvm_rdx_read(vcpu);
+ a3 = kvm_rsi_read(vcpu);
trace_kvm_hypercall(nr, a0, a1, a2, a3);
@@ -7033,17 +8386,23 @@
a3 &= 0xFFFFFFFF;
}
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
+
+ ret = -KVM_ENOSYS;
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_KICK_CPU:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+ break;
+
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
+ kvm_sched_yield(vcpu->kvm, a1);
ret = 0;
break;
#ifdef CONFIG_X86_64
@@ -7052,7 +8411,17 @@
break;
#endif
case KVM_HC_SEND_IPI:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+ break;
+
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+ break;
+ case KVM_HC_SCHED_YIELD:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+ break;
+
+ kvm_sched_yield(vcpu->kvm, a0);
+ ret = 0;
break;
default:
ret = -KVM_ENOSYS;
@@ -7061,7 +8430,7 @@
out:
if (!op_64_bit)
ret = (u32)ret;
- kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
+ kvm_rax_write(vcpu, ret);
++vcpu->stat.hypercalls;
return kvm_skip_emulated_instruction(vcpu);
@@ -7074,7 +8443,7 @@
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -7103,7 +8472,7 @@
{
int max_irr, tpr;
- if (!kvm_x86_ops->update_cr8_intercept)
+ if (!kvm_x86_ops.update_cr8_intercept)
return;
if (!lapic_in_kernel(vcpu))
@@ -7122,24 +8491,32 @@
tpr = kvm_lapic_get_cr8(vcpu);
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
}
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
- vcpu->arch.exception.error_code = false;
- kvm_x86_ops->queue_exception(vcpu);
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.injected);
+
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops.queue_exception(vcpu);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu)
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
+ bool can_inject = true;
/* try to reinject previous events if any */
- if (vcpu->arch.exception.injected)
+ if (vcpu->arch.exception.injected) {
kvm_inject_exception(vcpu);
+ can_inject = false;
+ }
/*
* Do not inject an NMI or interrupt if there is a pending
* exception. Exceptions and interrupts are recognized at
@@ -7155,11 +8532,17 @@
* fully complete the previous instruction.
*/
else if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected)
- kvm_x86_ops->set_nmi(vcpu);
- else if (vcpu->arch.interrupt.injected)
- kvm_x86_ops->set_irq(vcpu);
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops.set_nmi(vcpu);
+ can_inject = false;
+ } else if (vcpu->arch.interrupt.injected) {
+ kvm_x86_ops.set_irq(vcpu);
+ can_inject = false;
+ }
}
+
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
+ vcpu->arch.exception.pending);
/*
* Call check_nested_events() even if we reinjected a previous event
@@ -7167,69 +8550,107 @@
* from L2 to L1 due to pending L1 events which require exit
* from L2 to L1.
*/
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu);
- if (r != 0)
- return r;
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ if (r < 0)
+ goto busy;
}
/* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
-
- WARN_ON_ONCE(vcpu->arch.exception.injected);
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception.injected = true;
-
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exception and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs excplitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
- if (vcpu->arch.exception.nr == DB_VECTOR &&
- (vcpu->arch.dr7 & DR7_GD)) {
- vcpu->arch.dr7 &= ~DR7_GD;
- kvm_update_dr7(vcpu);
+ if (vcpu->arch.exception.nr == DB_VECTOR) {
+ kvm_deliver_exception_payload(vcpu);
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
}
kvm_inject_exception(vcpu);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
+ can_inject = false;
}
- /* Don't consider new event if we re-injected an event */
- if (kvm_event_needs_reinjection(vcpu))
- return 0;
-
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
- kvm_x86_ops->smi_allowed(vcpu)) {
- vcpu->arch.smi_pending = false;
- ++vcpu->arch.smi_count;
- enter_smm(vcpu);
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
- --vcpu->arch.nmi_pending;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
- /*
- * Because interrupts can be injected asynchronously, we are
- * calling check_nested_events again here to avoid a race condition.
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
- * proposal and current concerns. Perhaps we should be setting
- * KVM_REQ_EVENT only on certain events and not unconditionally?
- */
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu);
- if (r != 0)
- return r;
- }
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
- false);
- kvm_x86_ops->set_irq(vcpu);
- }
+ /*
+ * Finally, inject interrupt events. If an event cannot be injected
+ * due to architectural conditions (e.g. IF=0) a window-open exit
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
+ * and can architecturally be injected, but we cannot do it right now:
+ * an interrupt could have arrived just now and we have to inject it
+ * as a vmexit, or there could already an event in the queue, which is
+ * indicated by can_inject. In that case we request an immediate exit
+ * in order to make progress and get back here for another iteration.
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
+ */
+ if (vcpu->arch.smi_pending) {
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ can_inject = false;
+ } else
+ kvm_x86_ops.enable_smi_window(vcpu);
}
- return 0;
+ if (vcpu->arch.nmi_pending) {
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops.set_nmi(vcpu);
+ can_inject = false;
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+ }
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops.enable_nmi_window(vcpu);
+ }
+
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+ kvm_x86_ops.set_irq(vcpu);
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+ }
+ if (kvm_cpu_has_injectable_intr(vcpu))
+ kvm_x86_ops.enable_irq_window(vcpu);
+ }
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ *req_immediate_exit = true;
+
+ WARN_ON(vcpu->arch.exception.pending);
+ return;
+
+busy:
+ *req_immediate_exit = true;
+ return;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -7241,7 +8662,7 @@
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -7331,11 +8752,11 @@
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
@@ -7385,7 +8806,7 @@
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
@@ -7395,7 +8816,7 @@
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
@@ -7425,28 +8846,28 @@
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
- if (kvm_x86_ops->get_nmi_mask(vcpu))
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- kvm_x86_ops->set_nmi_mask(vcpu, true);
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- kvm_x86_ops->set_cr0(vcpu, cr0);
+ kvm_x86_ops.set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_x86_ops->set_cr4(vcpu, 0);
+ kvm_x86_ops.set_cr4(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- kvm_x86_ops->set_idt(vcpu, &dt);
+ kvm_x86_ops.set_idt(vcpu, &dt);
__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
@@ -7477,10 +8898,10 @@
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- kvm_x86_ops->set_efer(vcpu, 0);
+ kvm_x86_ops.set_efer(vcpu, 0);
#endif
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
kvm_mmu_reset_context(vcpu);
}
@@ -7490,10 +8911,82 @@
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
+{
+ cpumask_var_t cpus;
+
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+ NULL, vcpu_bitmap, cpus);
+
+ free_cpumask_var(cpus);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
+
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ kvm_apic_update_apicv(vcpu);
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+
+/*
+ * NOTE: Do not hold any lock prior to calling this.
+ *
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
+ * locked, because it calls __x86_set_memory_region() which does
+ * synchronize_srcu(&kvm->srcu).
+ */
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ struct kvm_vcpu *except;
+ unsigned long old, new, expected;
+
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+ return;
+
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
+ do {
+ expected = new = old;
+ if (activate)
+ __clear_bit(bit, &new);
+ else
+ __set_bit(bit, &new);
+ if (new == old)
+ break;
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
+ } while (old != expected);
+
+ if (!!old == !!new)
+ return;
+
+ trace_kvm_apicv_update_request(activate, bit);
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+
+ /*
+ * Sending request to update APICV for all other vcpus,
+ * while update the calling vcpu immediately instead of
+ * waiting for another #VMEXIT to handle the request.
+ */
+ except = kvm_get_running_vcpu();
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
+ except);
+ if (except)
+ kvm_vcpu_update_apicv(except);
+}
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
@@ -7506,7 +8999,7 @@
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -7526,7 +9019,7 @@
bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
vcpu_to_synic(vcpu)->vec_bitmap, 256);
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -7543,28 +9036,22 @@
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (kvm_x86_ops.guest_memory_reclaimed)
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
+}
+
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
- struct page *page = NULL;
-
if (!lapic_in_kernel(vcpu))
return;
- if (!kvm_x86_ops->set_apic_access_page_addr)
+ if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page))
- return;
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
-
- /*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
- */
- put_page(page);
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
{
@@ -7583,12 +9070,17 @@
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
- kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -7604,10 +9096,19 @@
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
- kvm_mmu_load_cr3(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_vcpu_flush_tlb(vcpu, true);
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+ kvm_mmu_load_pgd(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+ kvm_vcpu_flush_tlb_all(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
@@ -7678,6 +9179,12 @@
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
+ kvm_check_async_pf_completion(vcpu);
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+ kvm_x86_ops.msr_filter_changed(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -7688,32 +9195,9 @@
goto out;
}
- if (inject_pending_event(vcpu) != 0)
- req_immediate_exit = true;
- else {
- /* Enable SMI/NMI/IRQ window open exits if needed.
- *
- * SMIs have three cases:
- * 1) They can be nested, and then there is nothing to
- * do here because RSM will cause a vmexit anyway.
- * 2) There is an ISA-specific reason why SMI cannot be
- * injected, and the moment when this changes can be
- * intercepted.
- * 3) Or the SMI can be pending because
- * inject_pending_event has completed the injection
- * of an IRQ or NMI from the previous vmexit, and
- * then we request an immediate exit to inject the
- * SMI.
- */
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
- if (!kvm_x86_ops->enable_smi_window(vcpu))
- req_immediate_exit = true;
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
- WARN_ON(vcpu->arch.exception.pending);
- }
+ inject_pending_event(vcpu, &req_immediate_exit);
+ if (req_int_win)
+ kvm_x86_ops.enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -7728,7 +9212,7 @@
preempt_disable();
- kvm_x86_ops->prepare_guest_switch(vcpu);
+ kvm_x86_ops.prepare_guest_switch(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -7744,7 +9228,7 @@
* 1) We should set ->mode before checking ->requests. Please see
* the comment in kvm_vcpu_exiting_guest_mode().
*
- * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * 2) For APICv, we should set ->mode before checking PID.ON. This
* pairs with the memory barrier implicit in pi_test_and_set_on
* (see vmx_deliver_posted_interrupt).
*
@@ -7759,10 +9243,9 @@
* notified with kvm_vcpu_kick.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
- || need_resched() || signal_pending(current)) {
+ if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
@@ -7774,13 +9257,14 @@
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops->request_immediate_exit(vcpu);
+ kvm_x86_ops.request_immediate_exit(vcpu);
}
- trace_kvm_entry(vcpu->vcpu_id);
- if (lapic_timer_advance_ns)
- wait_lapic_expire(vcpu);
- guest_enter_irqoff();
+ trace_kvm_entry(vcpu);
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -7794,7 +9278,7 @@
set_debugreg(0, 7);
}
- kvm_x86_ops->run(vcpu);
+ exit_fastpath = kvm_x86_ops.run(vcpu);
/*
* Do this here before restoring debug registers on the host. And
@@ -7804,9 +9288,8 @@
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
kvm_update_dr0123(vcpu);
- kvm_update_dr6(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
@@ -7821,18 +9304,43 @@
if (hw_breakpoint_active())
hw_breakpoint_restore();
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
+
+ /*
+ * Consume any pending interrupts, including the possible source of
+ * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
+ * An instruction is required after local_irq_enable() to fully unblock
+ * interrupts on processors that implement an interrupt shadow, the
+ * stat.exits increment will do nicely.
+ */
kvm_before_interrupt(vcpu);
- kvm_x86_ops->handle_external_intr(vcpu);
+ local_irq_enable();
+ ++vcpu->stat.exits;
+ local_irq_disable();
kvm_after_interrupt(vcpu);
- ++vcpu->stat.exits;
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ vtime_account_guest_exit();
- guest_exit_irqoff();
+ if (lapic_in_kernel(vcpu)) {
+ s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
+ if (delta != S64_MIN) {
+ trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
+ vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
+ }
+ }
local_irq_enable();
preempt_enable();
@@ -7853,12 +9361,13 @@
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- vcpu->arch.gpa_available = false;
- r = kvm_x86_ops->handle_exit(vcpu);
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
return r;
cancel_injection:
- kvm_x86_ops->cancel_injection(vcpu);
+ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops.cancel_injection(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -7868,13 +9377,13 @@
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_x86_ops->post_block)
- kvm_x86_ops->post_block(vcpu);
+ if (kvm_x86_ops.post_block)
+ kvm_x86_ops.post_block(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -7886,6 +9395,7 @@
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
+ fallthrough;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
@@ -7893,15 +9403,14 @@
break;
default:
return -EINTR;
- break;
}
return 1;
}
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu);
+ if (is_guest_mode(vcpu))
+ kvm_x86_ops.nested_ops->check_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -7916,6 +9425,13 @@
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
@@ -7937,17 +9453,11 @@
break;
}
- kvm_check_async_pf_completion(vcpu);
-
- if (signal_pending(current)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- break;
- }
- if (need_resched()) {
+ if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- cond_resched();
+ r = xfer_to_guest_mode_handle_work(vcpu);
+ if (r)
+ return r;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
@@ -7960,12 +9470,11 @@
static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
{
int r;
+
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- if (r != EMULATE_DONE)
- return 0;
- return 1;
+ return r;
}
static int complete_emulated_pio(struct kvm_vcpu *vcpu)
@@ -8038,31 +9547,55 @@
return 0;
}
+static void kvm_save_current_fpu(struct fpu *fpu)
+{
+ /*
+ * If the target FPU state is not resident in the CPU registers, just
+ * memcpy() from current, else save CPU state directly to the target.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ memcpy(&fpu->state, ¤t->thread.fpu.state,
+ fpu_kernel_xstate_size);
+ else
+ copy_fpregs_to_fpstate(fpu);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
- copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
- /* PKRU is separately restored in kvm_x86_ops->run. */
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ fpregs_lock();
+
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
+
+ /* PKRU is separately restored in kvm_x86_ops.run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
- preempt_enable();
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
- copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
- copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
- preempt_enable();
+ fpregs_lock();
+
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
+
+ copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
+
+ fpregs_mark_activate();
+ fpregs_unlock();
+
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int r;
vcpu_load(vcpu);
@@ -8080,18 +9613,18 @@
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
goto out;
}
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
r = -EINVAL;
goto out;
}
- if (vcpu->run->kvm_dirty_regs) {
+ if (kvm_run->kvm_dirty_regs) {
r = sync_regs(vcpu);
if (r != 0)
goto out;
@@ -8121,7 +9654,7 @@
out:
kvm_put_guest_fpu(vcpu);
- if (vcpu->run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -8140,26 +9673,26 @@
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
}
- regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
- regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
- regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
- regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
- regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
- regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
- regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
- regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
+ regs->rax = kvm_rax_read(vcpu);
+ regs->rbx = kvm_rbx_read(vcpu);
+ regs->rcx = kvm_rcx_read(vcpu);
+ regs->rdx = kvm_rdx_read(vcpu);
+ regs->rsi = kvm_rsi_read(vcpu);
+ regs->rdi = kvm_rdi_read(vcpu);
+ regs->rsp = kvm_rsp_read(vcpu);
+ regs->rbp = kvm_rbp_read(vcpu);
#ifdef CONFIG_X86_64
- regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
- regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
- regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
- regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
- regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
- regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
- regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
- regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
+ regs->r8 = kvm_r8_read(vcpu);
+ regs->r9 = kvm_r9_read(vcpu);
+ regs->r10 = kvm_r10_read(vcpu);
+ regs->r11 = kvm_r11_read(vcpu);
+ regs->r12 = kvm_r12_read(vcpu);
+ regs->r13 = kvm_r13_read(vcpu);
+ regs->r14 = kvm_r14_read(vcpu);
+ regs->r15 = kvm_r15_read(vcpu);
#endif
regs->rip = kvm_rip_read(vcpu);
@@ -8179,23 +9712,23 @@
vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
- kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
- kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
- kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
- kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
- kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
- kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
- kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
- kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
+ kvm_rax_write(vcpu, regs->rax);
+ kvm_rbx_write(vcpu, regs->rbx);
+ kvm_rcx_write(vcpu, regs->rcx);
+ kvm_rdx_write(vcpu, regs->rdx);
+ kvm_rsi_write(vcpu, regs->rsi);
+ kvm_rdi_write(vcpu, regs->rdi);
+ kvm_rsp_write(vcpu, regs->rsp);
+ kvm_rbp_write(vcpu, regs->rbp);
#ifdef CONFIG_X86_64
- kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
- kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
- kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
- kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
- kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
- kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
- kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
- kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
+ kvm_r8_write(vcpu, regs->r8);
+ kvm_r9_write(vcpu, regs->r9);
+ kvm_r10_write(vcpu, regs->r10);
+ kvm_r11_write(vcpu, regs->r11);
+ kvm_r12_write(vcpu, regs->r12);
+ kvm_r13_write(vcpu, regs->r13);
+ kvm_r14_write(vcpu, regs->r14);
+ kvm_r15_write(vcpu, regs->r15);
#endif
kvm_rip_write(vcpu, regs->rip);
@@ -8238,10 +9771,10 @@
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -8253,7 +9786,7 @@
sregs->efer = vcpu->arch.efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
- memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
+ memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
set_bit(vcpu->arch.interrupt.nr,
@@ -8300,8 +9833,12 @@
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
goto out;
- /* INITs are latched while in SMM */
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ /*
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+ * INIT state; latched init should be reported using
+ * KVM_SET_VCPU_EVENTS, so reject it here.
+ */
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
@@ -8322,21 +9859,23 @@
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
has_error_code, error_code);
-
- if (ret)
- return EMULATE_FAIL;
+ if (ret) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return 0;
+ }
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- return EMULATE_DONE;
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -8350,6 +9889,8 @@
*/
if (!(sregs->cr4 & X86_CR4_PAE)
|| !(sregs->efer & EFER_LMA))
+ return -EINVAL;
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
return -EINVAL;
} else {
/*
@@ -8382,31 +9923,31 @@
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
+ kvm_x86_ops.set_idt(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
+ kvm_x86_ops.set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
(X86_CR4_OSXSAVE | X86_CR4_PKE));
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
if (cpuid_update_needed)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -8510,7 +10051,7 @@
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_bp_intercept(vcpu);
+ kvm_x86_ops.update_exception_bitmap(vcpu);
r = 0;
@@ -8549,7 +10090,7 @@
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
fpu->fsw = fxsave->swd;
@@ -8557,7 +10098,7 @@
fpu->last_opcode = fxsave->fop;
fpu->last_ip = fxsave->rip;
fpu->last_dp = fxsave->rdp;
- memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+ memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
vcpu_put(vcpu);
return 0;
@@ -8569,7 +10110,7 @@
vcpu_load(vcpu);
- fxsave = &vcpu->arch.guest_fpu.state.fxsave;
+ fxsave = &vcpu->arch.guest_fpu->state.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -8578,7 +10119,7 @@
fxsave->fop = fpu->last_opcode;
fxsave->rip = fpu->last_ip;
fxsave->rdp = fpu->last_dp;
- memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+ memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
vcpu_put(vcpu);
return 0;
@@ -8625,9 +10166,9 @@
static void fx_init(struct kvm_vcpu *vcpu)
{
- fpstate_init(&vcpu->arch.guest_fpu.state);
+ fpstate_init(&vcpu->arch.guest_fpu->state);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
+ vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
@@ -8638,48 +10179,122 @@
vcpu->arch.cr0 |= X86_CR0_ET;
}
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
-
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
- kvmclock_reset(vcpu);
-
- kvm_x86_ops->vcpu_free(vcpu);
- free_cpumask_var(wbinvd_dirty_mask);
-}
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
-{
- struct kvm_vcpu *vcpu;
-
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
-
- return vcpu;
+ return 0;
}
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
+ struct page *page;
+ int r;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ if (kvm_apicv_activated(vcpu->kvm))
+ vcpu->arch.apicv_active = true;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks)
+ goto fail_free_pio_data;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ if (!alloc_emulate_ctxt(vcpu))
+ goto free_wbinvd_dirty_mask;
+
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.user_fpu) {
+ pr_err("kvm: failed to allocate userspace's fpu\n");
+ goto free_emulate_ctxt;
+ }
+
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.guest_fpu) {
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
+ goto free_user_fpu;
+ }
+ fx_init(vcpu);
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ r = kvm_x86_ops.vcpu_create(vcpu);
+ if (r)
+ goto free_guest_fpu;
+
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
+ vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
kvm_vcpu_reset(vcpu, false);
- kvm_mmu_setup(vcpu);
+ kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
+
+free_guest_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+free_user_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_emulate_ctxt:
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
kvm_hv_vcpu_postcreate(vcpu);
@@ -8687,23 +10302,43 @@
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- msr.data = 0x0;
- msr.index = MSR_IA32_TSC;
- msr.host_initiated = true;
- kvm_write_tsc(vcpu, &msr);
+ kvm_synchronize_tsc(vcpu, 0);
vcpu_put(vcpu);
+
+ /* poll control enabled by default */
+ vcpu->arch.msr_kvm_poll_control = 1;
+
mutex_unlock(&vcpu->mutex);
- if (!kvmclock_periodic_sync)
- return;
-
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- kvm_arch_vcpu_free(vcpu);
+ int idx;
+
+ kvmclock_reset(vcpu);
+
+ kvm_x86_ops.vcpu_free(vcpu);
+
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ kvfree(vcpu->arch.cpuid_entries);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -8719,19 +10354,18 @@
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
- vcpu->arch.exception.pending = false;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = DR6_INIT;
- kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
vcpu->arch.cr2 = 0;
kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
+ vcpu->arch.apf.msr_en_val = 0;
+ vcpu->arch.apf.msr_int_val = 0;
vcpu->arch.st.msr_val = 0;
kvmclock_reset(vcpu);
@@ -8749,12 +10383,12 @@
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
- XFEATURE_MASK_BNDREGS);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
+ XFEATURE_BNDREGS);
if (mpx_state_buffer)
memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
- mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
- XFEATURE_MASK_BNDCSR);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
+ XFEATURE_BNDCSR);
if (mpx_state_buffer)
memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
if (init_event)
@@ -8765,7 +10399,6 @@
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
- vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
vcpu->arch.msr_misc_features_enables = 0;
vcpu->arch.xcr0 = XFEATURE_MASK_FP;
@@ -8777,7 +10410,7 @@
vcpu->arch.ia32_xss = 0;
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -8801,8 +10434,8 @@
u64 max_tsc = 0;
bool stable, backwards_tsc = false;
- kvm_shared_msr_cpu_online();
- ret = kvm_x86_ops->hardware_enable();
+ kvm_user_return_msr_cpu_online();
+ ret = kvm_x86_ops.hardware_enable();
if (ret != 0)
return ret;
@@ -8828,7 +10461,7 @@
* before any KVM threads can be running. Unfortunately, we can't
* bring the TSCs fully up to date with real time, as we aren't yet far
* enough into CPU bringup that we know how much real time has actually
- * elapsed; our helper function, ktime_get_boot_ns() will be using boot
+ * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
* variables that haven't been updated yet.
*
* So we simply find the maximum observed TSC above, then record the
@@ -8884,19 +10517,32 @@
void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops->hardware_disable();
+ kvm_x86_ops.hardware_disable();
drop_user_return_notifiers();
}
-int kvm_arch_hardware_setup(void)
+int kvm_arch_hardware_setup(void *opaque)
{
+ struct kvm_x86_init_ops *ops = opaque;
int r;
- r = kvm_x86_ops->hardware_setup();
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ r = ops->hardware_setup();
if (r != 0)
return r;
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
if (kvm_has_tsc_control) {
/*
@@ -8918,12 +10564,21 @@
void kvm_arch_hardware_unsetup(void)
{
- kvm_x86_ops->hardware_unsetup();
+ kvm_x86_ops.hardware_unsetup();
}
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void *opaque)
{
- kvm_x86_ops->check_processor_compatibility(rtn);
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ struct kvm_x86_init_ops *ops = opaque;
+
+ WARN_ON(!irqs_disabled());
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return ops->check_processor_compatibility();
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -8940,107 +10595,35 @@
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int r;
-
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->arch.pio_data = page_address(page);
-
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- r = kvm_create_lapic(vcpu);
- if (r < 0)
- goto fail_mmu_destroy;
- } else
- static_key_slow_inc(&kvm_no_apic_vcpu);
-
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL);
- if (!vcpu->arch.mce_banks) {
- r = -ENOMEM;
- goto fail_free_lapic;
- }
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
-
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
- r = -ENOMEM;
- goto fail_free_mce_banks;
- }
-
- fx_init(vcpu);
-
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
- kvm_async_pf_hash_reset(vcpu);
- kvm_pmu_init(vcpu);
-
- vcpu->arch.pending_external_vector = -1;
- vcpu->arch.preempted_in_kernel = false;
-
- kvm_hv_vcpu_init(vcpu);
-
- return 0;
-
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
-fail_free_lapic:
- kvm_free_lapic(vcpu);
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- int idx;
-
- kvm_hv_vcpu_uninit(vcpu);
- kvm_pmu_destroy(vcpu);
- kfree(vcpu->arch.mce_banks);
- kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- free_page((unsigned long)vcpu->arch.pio_data);
- if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
-}
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
vcpu->arch.l1tf_flush_l1d = true;
- kvm_x86_ops->sched_in(vcpu, cpu);
+ if (pmu->version && unlikely(pmu->event_count)) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+ kvm_x86_ops.sched_in(vcpu, cpu);
}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ kfree(kvm->arch.hyperv.hv_pa_pg);
+ vfree(kvm);
+}
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ int ret;
+
if (type)
return -EINVAL;
+
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ return ret;
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
@@ -9059,7 +10642,7 @@
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -9068,13 +10651,9 @@
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
kvm_hv_init_vm(kvm);
- kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- if (kvm_x86_ops->vm_init)
- return kvm_x86_ops->vm_init(kvm);
-
- return 0;
+ return kvm_x86_ops.vm_init(kvm);
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -9102,7 +10681,7 @@
kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_free(vcpu);
+ kvm_vcpu_destroy(vcpu);
mutex_lock(&kvm->lock);
for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
@@ -9122,9 +10701,9 @@
int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
{
int i, r;
- unsigned long hva;
+ unsigned long hva, old_npages;
struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *slot, old;
+ struct kvm_memory_slot *slot;
/* Called with kvm->slots_lock held. */
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@ -9132,7 +10711,7 @@
slot = id_to_memslot(slots, id);
if (size) {
- if (slot->npages)
+ if (slot && slot->npages)
return -EEXIST;
/*
@@ -9144,13 +10723,13 @@
if (IS_ERR((void *)hva))
return PTR_ERR((void *)hva);
} else {
- if (!slot->npages)
+ if (!slot || !slot->npages)
return 0;
+ old_npages = slot->npages;
hva = 0;
}
- old = *slot;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
struct kvm_userspace_memory_region m;
@@ -9165,23 +10744,11 @@
}
if (!size)
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+ vm_munmap(hva, old_npages * PAGE_SIZE);
return 0;
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
-
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
-{
- int r;
-
- mutex_lock(&kvm->slots_lock);
- r = __x86_set_memory_region(kvm, id, gpa, size);
- mutex_unlock(&kvm->slots_lock);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
@@ -9196,46 +10763,47 @@
* unless the the memory map has changed due to process exit
* or fd copying.
*/
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
}
- if (kvm_x86_ops->vm_destroy)
- kvm_x86_ops->vm_destroy(kvm);
+ if (kvm_x86_ops.vm_destroy)
+ kvm_x86_ops.vm_destroy(kvm);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
kvm_page_track_cleanup(kvm);
kvm_hv_destroy_vm(kvm);
}
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
- kvfree(free->arch.rmap[i]);
- free->arch.rmap[i] = NULL;
- }
+ kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+
if (i == 0)
continue;
- if (!dont || free->arch.lpage_info[i - 1] !=
- dont->arch.lpage_info[i - 1]) {
- kvfree(free->arch.lpage_info[i - 1]);
- free->arch.lpage_info[i - 1] = NULL;
- }
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
- kvm_page_track_free_memslot(free, dont);
+ kvm_page_track_free_memslot(slot);
}
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned long npages)
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
int i;
@@ -9257,13 +10825,13 @@
slot->arch.rmap[i] =
kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!slot->arch.rmap[i])
goto out_free;
if (i == 0)
continue;
- linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+ linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
if (!linfo)
goto out_free;
@@ -9276,11 +10844,9 @@
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
- * other, or if explicitly asked to, disable large page
- * support for this slot
+ * other, disable large page support for this slot.
*/
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
- !kvm_largepages_enabled()) {
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
unsigned long j;
for (j = 0; j < lpages; ++j)
@@ -9327,76 +10893,23 @@
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
- if (change == KVM_MR_MOVE)
- return kvm_arch_create_memslot(kvm, memslot,
- mem->memory_size >> PAGE_SHIFT);
-
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+ return kvm_alloc_memslot_metadata(memslot,
+ mem->memory_size >> PAGE_SHIFT);
return 0;
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
- struct kvm_memory_slot *new)
+ struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- /* Still write protect RO slot */
- if (new->flags & KVM_MEM_READONLY) {
- kvm_mmu_slot_remove_write_access(kvm, new);
- return;
- }
-
/*
- * Call kvm_x86_ops dirty logging hooks when they are valid.
- *
- * kvm_x86_ops->slot_disable_log_dirty is called when:
- *
- * - KVM_MR_CREATE with dirty logging is disabled
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
- *
- * The reason is, in case of PML, we need to set D-bit for any slots
- * with dirty logging disabled in order to eliminate unnecessary GPA
- * logging in PML buffer (and potential PML buffer full VMEXT). This
- * guarantees leaving PML enabled during guest's lifetime won't have
- * any additonal overhead from PML when guest is running with dirty
- * logging disabled for memory slots.
- *
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
- * to dirty logging mode.
- *
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
- *
- * In case of write protect:
- *
- * Write protect all pages for dirty logging.
- *
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
- *
- * See the comments in fast_page_fault().
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
+ * See comments below.
*/
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
- if (kvm_x86_ops->slot_enable_log_dirty)
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
- else
- kvm_mmu_slot_remove_write_access(kvm, new);
- } else {
- if (kvm_x86_ops->slot_disable_log_dirty)
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
- }
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
- const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new,
- enum kvm_mr_change change)
-{
- int nr_mmu_pages = 0;
-
- if (!kvm->arch.n_requested_mmu_pages)
- nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
-
- if (nr_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ return;
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
@@ -9409,29 +10922,91 @@
* Scan sptes if dirty logging has been stopped, dropping those
* which can be collapsed into a single large-page spte. Later
* page faults will create the large-page sptes.
+ *
+ * There is no need to do this in any of the following cases:
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot()
*/
- if ((change != KVM_MR_DELETE) &&
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_zap_collapsible_sptes(kvm, new);
/*
- * Set up write protection and/or dirty logging for the new slot.
+ * Enable or disable dirty logging for the slot.
*
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
- * been zapped so no dirty logging staff is needed for old slot. For
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
- * new and it's also covered when dealing with the new slot.
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
+ * slot have been zapped so no dirty logging updates are needed for
+ * the old slot.
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
+ * any mappings that might be created in it will consume the
+ * properties of the new slot and do not need to be updated here.
*
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
+ * called to enable/disable dirty logging.
+ *
+ * When disabling dirty logging with PML enabled, the D-bit is set
+ * for sptes in the slot in order to prevent unnecessary GPA
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
+ * This guarantees leaving PML enabled for the guest's lifetime
+ * won't have any additional overhead from PML when the guest is
+ * running with dirty logging disabled.
+ *
+ * When enabling dirty logging, large sptes are write-protected
+ * so they can be split on first write. New large sptes cannot
+ * be created for this slot until the end of the logging.
+ * See the comments in fast_page_fault().
+ * For small sptes, nothing is done if the dirty log is in the
+ * initial-all-set state. Otherwise, depending on whether pml
+ * is enabled the D-bit or the W-bit will be cleared.
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops.slot_enable_log_dirty) {
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+ } else {
+ int level =
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+ PG_LEVEL_2M : PG_LEVEL_4K;
+
+ /*
+ * If we're with initial-all-set, we don't need
+ * to write protect any small page because
+ * they're reported as dirty already. However
+ * we still need to write-protect huge pages
+ * so that the page split can happen lazily on
+ * the first write to the huge page.
+ */
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
+ }
+ } else {
+ if (kvm_x86_ops.slot_disable_log_dirty)
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+ }
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ if (!kvm->arch.n_requested_mmu_pages)
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
+
+ /*
* FIXME: const-ify all uses of struct kvm_memory_slot.
*/
- if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+
+ /* Free the arrays associated with the old memslot. */
+ if (change == KVM_MR_MOVE)
+ kvm_arch_free_memslot(kvm, old);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm_mmu_zap_all(kvm);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -9443,8 +11018,8 @@
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops->guest_apic_has_interrupt &&
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+ kvm_x86_ops.guest_apic_has_interrupt &&
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -9463,11 +11038,12 @@
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- kvm_x86_ops->nmi_allowed(vcpu)))
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
return true;
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
+ (vcpu->arch.smi_pending &&
+ kvm_x86_ops.smi_allowed(vcpu, false)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -9476,6 +11052,11 @@
return true;
if (kvm_hv_has_stimer_pending(vcpu))
+ return true;
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
return true;
return false;
@@ -9496,7 +11077,7 @@
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
return true;
return false;
@@ -9514,7 +11095,7 @@
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -9536,7 +11117,7 @@
{
unsigned long rflags;
- rflags = kvm_x86_ops->get_rflags(vcpu);
+ rflags = kvm_x86_ops.get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -9548,7 +11129,7 @@
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- kvm_x86_ops->set_rflags(vcpu, rflags);
+ kvm_x86_ops.set_rflags(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -9562,7 +11143,7 @@
{
int r;
- if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -9570,21 +11151,23 @@
if (unlikely(r))
return;
- if (!vcpu->arch.mmu.direct_map &&
- work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ if (!vcpu->arch.mmu->direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
return;
- vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
static inline u32 kvm_async_pf_next_probe(u32 key)
{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
}
static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -9602,7 +11185,7 @@
int i;
u32 key = kvm_async_pf_hash_fn(gfn);
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
(vcpu->arch.apf.gfns[key] != gfn &&
vcpu->arch.apf.gfns[key] != ~0); i++)
key = kvm_async_pf_next_probe(key);
@@ -9620,6 +11203,10 @@
u32 i, j, k;
i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+ return;
+
while (true) {
vcpu->arch.apf.gfns[i] = ~0;
do {
@@ -9638,21 +11225,64 @@
}
}
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
{
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
+ sizeof(reason));
}
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
- sizeof(u32));
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &token, offset, sizeof(token));
}
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+ u32 val;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &val, offset, sizeof(val)))
+ return false;
+
+ return !val;
+}
+
+static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
+ return false;
+
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
+ return false;
+
+ return true;
+}
+
+bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!lapic_in_kernel(vcpu) ||
+ kvm_event_needs_reinjection(vcpu) ||
+ vcpu->arch.exception.pending))
+ return false;
+
+ if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
+ return false;
+
+ /*
+ * If interrupts are off we cannot even use an artificial
+ * halt state.
+ */
+ return kvm_arch_interrupt_allowed(vcpu);
+}
+
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
@@ -9660,11 +11290,8 @@
trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
- kvm_make_request(KVM_REQ_APF_HALT, vcpu);
- else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ if (kvm_can_deliver_async_pf(vcpu) &&
+ !apf_put_user_notpresent(vcpu)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
@@ -9672,14 +11299,28 @@
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
+ return true;
+ } else {
+ /*
+ * It is not possible to deliver a paravirtualized asynchronous
+ * page fault, but putting the guest in an artificial halt state
+ * can be beneficial nevertheless: if an interrupt arrives, we
+ * can deliver it timely and perhaps the guest will schedule
+ * another process. When the instruction that triggered a page
+ * fault is retried, hopefully the page will be ready in the host.
+ */
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return false;
}
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
- struct x86_exception fault;
- u32 val;
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vcpu->arch.apf.vec
+ };
if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
@@ -9687,37 +11328,30 @@
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
- !apf_get_user(vcpu, &val)) {
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
- vcpu->arch.exception.pending &&
- vcpu->arch.exception.nr == PF_VECTOR &&
- !apf_put_user(vcpu, 0)) {
- vcpu->arch.exception.injected = false;
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception.nr = 0;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.error_code = 0;
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- fault.async_page_fault = true;
- kvm_inject_page_fault(vcpu, &fault);
- }
+ if ((work->wakeup_all || work->notpresent_injected) &&
+ kvm_pv_async_pf_enabled(vcpu) &&
+ !apf_put_user_ready(vcpu, work->arch.token)) {
+ vcpu->arch.apf.pageready_pending = true;
+ kvm_apic_set_irq(vcpu, &irq, NULL);
}
+
vcpu->arch.apf.halted = false;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ if (!vcpu->arch.apf.pageready_pending)
+ kvm_vcpu_kick(vcpu);
+}
+
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pv_async_pf_enabled(vcpu))
return true;
else
- return kvm_can_do_async_pf(vcpu);
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
@@ -9732,9 +11366,9 @@
}
EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
{
- return atomic_read(&kvm->arch.assigned_device_count);
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
@@ -9758,7 +11392,7 @@
bool kvm_arch_has_irq_bypass(void)
{
- return kvm_x86_ops->update_pi_irte != NULL;
+ return true;
}
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
@@ -9766,11 +11400,17 @@
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ int ret;
irqfd->producer = prod;
+ kvm_arch_start_assignment(irqfd->kvm);
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
+ if (ret)
+ kvm_arch_end_assignment(irqfd->kvm);
+
+ return ret;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
@@ -9789,26 +11429,185 @@
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
+
+ kvm_arch_end_assignment(irqfd->kvm);
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- if (!kvm_x86_ops->update_pi_irte)
- return -EINVAL;
-
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
}
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+
+bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
+{
+ return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+
+
+int kvm_spec_ctrl_test_value(u64 value)
+{
+ /*
+ * test that setting IA32_SPEC_CTRL to given value
+ * is allowed by the host processor
+ */
+
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+{
+ struct x86_exception fault;
+ u32 access = error_code &
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+ * while handling a VMX instruction KVM could've handled the request
+ * correctly by exiting to userspace and performing I/O but there
+ * doesn't seem to be a real use-case behind such requests, just return
+ * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
@@ -9820,12 +11619,31 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);
--
Gitblit v1.6.2