From 297b60346df8beafee954a0fd7c2d64f33f3b9bc Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Sat, 11 May 2024 01:44:05 +0000 Subject: [PATCH] rtl8211F_led_control --- kernel/arch/x86/kvm/x86.c | 4915 +++++++++++++++++++++++++++++++++++++++++------------------ 1 files changed, 3,363 insertions(+), 1,552 deletions(-) diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c index 9829e0f..9d30158 100644 --- a/kernel/arch/x86/kvm/x86.c +++ b/kernel/arch/x86/kvm/x86.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * @@ -13,22 +14,21 @@ * Yaniv Kamay <yaniv@qumranet.com> * Amit Shah <amit.shah@qumranet.com> * Ben-Ami Yassour <benami@il.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * */ #include <linux/kvm_host.h> #include "irq.h" +#include "ioapic.h" #include "mmu.h" #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" #include "hyperv.h" +#include "lapic.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -54,7 +54,9 @@ #include <linux/kvm_irqfd.h> #include <linux/irqbypass.h> #include <linux/sched/stat.h> +#include <linux/sched/isolation.h> #include <linux/mem_encrypt.h> +#include <linux/entry-kvm.h> #include <trace/events/kvm.h> @@ -69,6 +71,10 @@ #include <asm/irq_remapping.h> #include <asm/mshyperv.h> #include <asm/hypervisor.h> +#include <asm/tlbflush.h> +#include <asm/intel_pt.h> +#include <asm/emulate_prefix.h> +#include <clocksource/hyperv_timer.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -79,7 +85,7 @@ EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -94,9 +100,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; -#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ -#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ - #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -108,7 +111,7 @@ static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -struct kvm_x86_ops *kvm_x86_ops __read_mostly; +struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; @@ -138,10 +141,14 @@ static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); -/* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 0; -module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); +/* + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables + * adaptive tuning starting from default advancment of 1000ns. '0' disables + * advancement entirely. Any other value is used as-is and disables adaptive + * tuning, i.e. allows priveleged userspace to set an exact advancement time. 
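A minimal standalone sketch of how the three parameter values described in this comment map to behaviour; the helper name and the 1000 ns adaptive starting point are taken from the comment text only, and nothing below is the in-tree implementation:

#include <stdio.h>

#define ADAPTIVE_START_NS 1000u	/* per the comment: adaptive tuning starts at 1000ns */

/* Illustrative only: interpret the tri-state module parameter. */
static void interpret_timer_advance(int param, unsigned int *advance_ns, int *adaptive)
{
	if (param == -1) {		/* adaptive tuning */
		*advance_ns = ADAPTIVE_START_NS;
		*adaptive = 1;
	} else if (param == 0) {	/* advancement disabled */
		*advance_ns = 0;
		*adaptive = 0;
	} else {			/* fixed, user-supplied advancement */
		*advance_ns = (unsigned int)param;
		*adaptive = 0;
	}
}

int main(void)
{
	unsigned int ns;
	int adaptive;

	interpret_timer_advance(-1, &ns, &adaptive);
	printf("param=-1 -> advance=%uns adaptive=%d\n", ns, adaptive);
	return 0;
}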
+ */ +static int __read_mostly lapic_timer_advance_ns = -1; +module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); @@ -153,85 +160,149 @@ static bool __read_mostly force_emulation_prefix = false; module_param(force_emulation_prefix, bool, S_IRUGO); -#define KVM_NR_SHARED_MSRS 16 +int __read_mostly pi_inject_timer = -1; +module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR); -struct kvm_shared_msrs_global { +/* + * Restoring the host value for MSRs that are only consumed when running in + * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU + * returns to userspace, i.e. the kernel can run with the guest's value. + */ +#define KVM_MAX_NR_USER_RETURN_MSRS 16 + +struct kvm_user_return_msrs_global { int nr; - u32 msrs[KVM_NR_SHARED_MSRS]; + u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS]; }; -struct kvm_shared_msrs { +struct kvm_user_return_msrs { struct user_return_notifier urn; bool registered; - struct kvm_shared_msr_values { + struct kvm_user_return_msr_values { u64 host; u64 curr; - } values[KVM_NR_SHARED_MSRS]; + } values[KVM_MAX_NR_USER_RETURN_MSRS]; }; -static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static struct kvm_shared_msrs __percpu *shared_msrs; +static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global; +static struct kvm_user_return_msrs __percpu *user_return_msrs; + +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) + +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + +bool __read_mostly allow_smaller_maxphyaddr = 0; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + +static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { - { "pf_fixed", VCPU_STAT(pf_fixed) }, - { "pf_guest", VCPU_STAT(pf_guest) }, - { "tlb_flush", VCPU_STAT(tlb_flush) }, - { "invlpg", VCPU_STAT(invlpg) }, - { "exits", VCPU_STAT(exits) }, - { "io_exits", VCPU_STAT(io_exits) }, - { "mmio_exits", VCPU_STAT(mmio_exits) }, - { "signal_exits", VCPU_STAT(signal_exits) }, - { "irq_window", VCPU_STAT(irq_window_exits) }, - { "nmi_window", VCPU_STAT(nmi_window_exits) }, - { "halt_exits", VCPU_STAT(halt_exits) }, - { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, - { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, - { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, - { "halt_wakeup", VCPU_STAT(halt_wakeup) }, - { "hypercalls", VCPU_STAT(hypercalls) }, - { "request_irq", VCPU_STAT(request_irq_exits) }, - { "irq_exits", VCPU_STAT(irq_exits) }, - { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "fpu_reload", VCPU_STAT(fpu_reload) }, - { "insn_emulation", VCPU_STAT(insn_emulation) }, - { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, - { "irq_injections", VCPU_STAT(irq_injections) }, - { "nmi_injections", VCPU_STAT(nmi_injections) }, - { "req_event", VCPU_STAT(req_event) }, - { "l1d_flush", VCPU_STAT(l1d_flush) }, - { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, - { "mmu_pte_write", VM_STAT(mmu_pte_write) }, - { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, - { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) }, - { "mmu_flooded", VM_STAT(mmu_flooded) }, - { "mmu_recycled", VM_STAT(mmu_recycled) }, - { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, - { "mmu_unsync", 
VM_STAT(mmu_unsync) }, - { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages, .mode = 0444) }, - { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, - { "max_mmu_page_hash_collisions", - VM_STAT(max_mmu_page_hash_collisions) }, + VCPU_STAT("pf_fixed", pf_fixed), + VCPU_STAT("pf_guest", pf_guest), + VCPU_STAT("tlb_flush", tlb_flush), + VCPU_STAT("invlpg", invlpg), + VCPU_STAT("exits", exits), + VCPU_STAT("io_exits", io_exits), + VCPU_STAT("mmio_exits", mmio_exits), + VCPU_STAT("signal_exits", signal_exits), + VCPU_STAT("irq_window", irq_window_exits), + VCPU_STAT("nmi_window", nmi_window_exits), + VCPU_STAT("halt_exits", halt_exits), + VCPU_STAT("halt_successful_poll", halt_successful_poll), + VCPU_STAT("halt_attempted_poll", halt_attempted_poll), + VCPU_STAT("halt_poll_invalid", halt_poll_invalid), + VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("hypercalls", hypercalls), + VCPU_STAT("request_irq", request_irq_exits), + VCPU_STAT("irq_exits", irq_exits), + VCPU_STAT("host_state_reload", host_state_reload), + VCPU_STAT("fpu_reload", fpu_reload), + VCPU_STAT("insn_emulation", insn_emulation), + VCPU_STAT("insn_emulation_fail", insn_emulation_fail), + VCPU_STAT("irq_injections", irq_injections), + VCPU_STAT("nmi_injections", nmi_injections), + VCPU_STAT("req_event", req_event), + VCPU_STAT("l1d_flush", l1d_flush), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), + VCPU_STAT("preemption_reported", preemption_reported), + VCPU_STAT("preemption_other", preemption_other), + VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), + VM_STAT("mmu_pte_write", mmu_pte_write), + VM_STAT("mmu_pde_zapped", mmu_pde_zapped), + VM_STAT("mmu_flooded", mmu_flooded), + VM_STAT("mmu_recycled", mmu_recycled), + VM_STAT("mmu_cache_miss", mmu_cache_miss), + VM_STAT("mmu_unsync", mmu_unsync), + VM_STAT("remote_tlb_flush", remote_tlb_flush), + VM_STAT("largepages", lpages, .mode = 0444), + VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444), + VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions), { NULL } }; u64 __read_mostly host_xcr0; +u64 __read_mostly supported_xcr0; +EXPORT_SYMBOL_GPL(supported_xcr0); + +static struct kmem_cache *x86_fpu_cache; + +static struct kmem_cache *x86_emulator_cache; + +/* + * When called, it means the previous get/set msr reached an invalid msr. + * Return true if we want to ignore/silent this failed msr access. + */ +static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) +{ + const char *op = write ? 
"wrmsr" : "rdmsr"; + + if (ignore_msrs) { + if (report_ignored_msrs) + kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n", + op, msr, data); + /* Mask the error */ + return true; + } else { + kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); + return false; + } +} + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); +} static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; - struct kvm_shared_msrs *locals - = container_of(urn, struct kvm_shared_msrs, urn); - struct kvm_shared_msr_values *values; + struct kvm_user_return_msrs *msrs + = container_of(urn, struct kvm_user_return_msrs, urn); + struct kvm_user_return_msr_values *values; unsigned long flags; /* @@ -239,84 +310,89 @@ * interrupted and executed through kvm_arch_hardware_disable() */ local_irq_save(flags); - if (locals->registered) { - locals->registered = false; + if (msrs->registered) { + msrs->registered = false; user_return_notifier_unregister(urn); } local_irq_restore(flags); - for (slot = 0; slot < shared_msrs_global.nr; ++slot) { - values = &locals->values[slot]; + for (slot = 0; slot < user_return_msrs_global.nr; ++slot) { + values = &msrs->values[slot]; if (values->host != values->curr) { - wrmsrl(shared_msrs_global.msrs[slot], values->host); + wrmsrl(user_return_msrs_global.msrs[slot], values->host); values->curr = values->host; } } } -static void shared_msr_update(unsigned slot, u32 msr) +int kvm_probe_user_return_msr(u32 msr) { + u64 val; + int ret; + + preempt_disable(); + ret = rdmsrl_safe(msr, &val); + if (ret) + goto out; + ret = wrmsrl_safe(msr, val); +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr); + +void kvm_define_user_return_msr(unsigned slot, u32 msr) +{ + BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS); + user_return_msrs_global.msrs[slot] = msr; + if (slot >= user_return_msrs_global.nr) + user_return_msrs_global.nr = slot + 1; +} +EXPORT_SYMBOL_GPL(kvm_define_user_return_msr); + +static void kvm_user_return_msr_cpu_online(void) +{ + unsigned int cpu = smp_processor_id(); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + int i; - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; + for (i = 0; i < user_return_msrs_global.nr; ++i) { + rdmsrl_safe(user_return_msrs_global.msrs[i], &value); + msrs->values[i].host = value; + msrs->values[i].curr = value; } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; } -void kvm_define_shared_msr(unsigned slot, u32 msr) -{ - BUG_ON(slot >= KVM_NR_SHARED_MSRS); - shared_msrs_global.msrs[slot] = msr; - if (slot >= shared_msrs_global.nr) - shared_msrs_global.nr = slot + 1; -} -EXPORT_SYMBOL_GPL(kvm_define_shared_msr); - -static 
void kvm_shared_msr_cpu_online(void) -{ - unsigned i; - - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); -} - -int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); int err; - value = (value & mask) | (smsr->values[slot].host & ~mask); - if (value == smsr->values[slot].curr) + value = (value & mask) | (msrs->values[slot].host & ~mask); + if (value == msrs->values[slot].curr) return 0; - err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value); if (err) return 1; - smsr->values[slot].curr = value; - if (!smsr->registered) { - smsr->urn.on_user_return = kvm_on_user_return; - user_return_notifier_register(&smsr->urn); - smsr->registered = true; + msrs->values[slot].curr = value; + if (!msrs->registered) { + msrs->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&msrs->urn); + msrs->registered = true; } return 0; } -EXPORT_SYMBOL_GPL(kvm_set_shared_msr); +EXPORT_SYMBOL_GPL(kvm_set_user_return_msr); static void drop_user_return_notifiers(void) { unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu); - if (smsr->registered) - kvm_on_user_return(&smsr->urn); + if (msrs->registered) + kvm_on_user_return(&msrs->urn); } u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) @@ -348,14 +424,15 @@ } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible void kvm_spurious_fault(void) +asmlinkage __visible noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -384,6 +461,7 @@ #define EXCPT_TRAP 1 #define EXCPT_ABORT 2 #define EXCPT_INTERRUPT 3 +#define EXCPT_DB 4 static int exception_type(int vector) { @@ -394,8 +472,14 @@ mask = 1 << vector; - /* #DB is trap, as instruction watchpoints are handled elsewhere */ - if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + /* + * #DBs can be trap-like or fault-like, the caller must check other CPU + * state, e.g. DR6, to determine whether a #DB is a trap or fault. + */ + if (mask & (1 << DB_VECTOR)) + return EXCPT_DB; + + if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR))) return EXCPT_TRAP; if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) @@ -405,9 +489,59 @@ return EXCPT_FAULT; } +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) +{ + unsigned nr = vcpu->arch.exception.nr; + bool has_payload = vcpu->arch.exception.has_payload; + unsigned long payload = vcpu->arch.exception.payload; + + if (!has_payload) + return; + + switch (nr) { + case DB_VECTOR: + /* + * "Certain debug exceptions may clear bit 0-3. The + * remaining contents of the DR6 register are never + * cleared by the processor". + */ + vcpu->arch.dr6 &= ~DR_TRAP_BITS; + /* + * DR6.RTM is set by all #DB exceptions that don't clear it. + */ + vcpu->arch.dr6 |= DR6_RTM; + vcpu->arch.dr6 |= payload; + /* + * Bit 16 should be set in the payload whenever the #DB + * exception should clear DR6.RTM. This makes the payload + * compatible with the pending debug exceptions under VMX. 
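To make the payload-to-DR6 merge described in this comment concrete, here is a standalone sketch that applies the same bit operations to a sample payload. The constants are spelled out locally with their architectural values, and merge_db_payload() is an illustrative name, not a kernel function:

#include <stdint.h>
#include <stdio.h>

#define DR_TRAP_BITS	0xfULL		/* DR6[3:0], B0-B3 */
#define DR6_RTM		(1ULL << 16)	/* DR6.RTM, inverted polarity */
#define DR6_BIT12	(1ULL << 12)	/* valid in the VMX field, reserved in DR6 */

/* Mirrors the #DB payload merge described above (illustrative only). */
static uint64_t merge_db_payload(uint64_t dr6, uint64_t payload)
{
	dr6 &= ~DR_TRAP_BITS;		/* certain #DBs clear B0-B3 */
	dr6 |= DR6_RTM;			/* #DBs that don't clear RTM leave it set */
	dr6 |= payload;			/* payload bits set the matching DR6 bits */
	dr6 ^= payload & DR6_RTM;	/* payload bit 16 set => clear DR6.RTM */
	dr6 &= ~DR6_BIT12;		/* bit 12 is reserved in DR6 */
	return dr6;
}

int main(void)
{
	/* sample payload: B0 hit, and "clear DR6.RTM" requested via bit 16 */
	printf("DR6 = %#llx\n",
	       (unsigned long long)merge_db_payload(0xffff0ff0, 0x1 | DR6_RTM));
	return 0;
}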
+ * Though not currently documented in the SDM, this also + * makes the payload compatible with the exit qualification + * for #DB exceptions under VMX. + */ + vcpu->arch.dr6 ^= payload & DR6_RTM; + + /* + * The #DB payload is defined as compatible with the 'pending + * debug exceptions' field under VMX, not DR6. While bit 12 is + * defined in the 'pending debug exceptions' field (enabled + * breakpoint), it is reserved and must be zero in DR6. + */ + vcpu->arch.dr6 &= ~BIT(12); + break; + case PF_VECTOR: + vcpu->arch.cr2 = payload; + break; + } + + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; +} +EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload); + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, - bool reinject) + bool has_payload, unsigned long payload, bool reinject) { u32 prev_nr; int class1, class2; @@ -427,6 +561,14 @@ */ WARN_ON_ONCE(vcpu->arch.exception.pending); vcpu->arch.exception.injected = true; + if (WARN_ON_ONCE(has_payload)) { + /* + * A reinjected event has already + * delivered its payload. + */ + has_payload = false; + payload = 0; + } } else { vcpu->arch.exception.pending = true; vcpu->arch.exception.injected = false; @@ -434,6 +576,10 @@ vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.has_payload = has_payload; + vcpu->arch.exception.payload = payload; + if (!is_guest_mode(vcpu)) + kvm_deliver_exception_payload(vcpu); return; } @@ -458,6 +604,8 @@ vcpu->arch.exception.has_error_code = true; vcpu->arch.exception.nr = DF_VECTOR; vcpu->arch.exception.error_code = 0; + vcpu->arch.exception.has_payload = false; + vcpu->arch.exception.payload = 0; } else /* replace previous exception with a new one in a hope that instruction re-execution will regenerate lost @@ -467,15 +615,29 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, false); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0, true); + kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception); + +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) +{ + kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); +} +EXPORT_SYMBOL_GPL(kvm_queue_exception_p); + +static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, + u32 error_code, unsigned long payload) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, + true, payload, false); +} int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) { @@ -493,23 +655,38 @@ ++vcpu->stat.pf_guest; vcpu->arch.exception.nested_apf = is_guest_mode(vcpu) && fault->async_page_fault; - if (vcpu->arch.exception.nested_apf) + if (vcpu->arch.exception.nested_apf) { vcpu->arch.apf.nested_apf_token = fault->address; - else - vcpu->arch.cr2 = fault->address; - kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); + kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code); + } else { + kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code, + fault->address); + } } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception 
*fault) { - if (mmu_is_nested(vcpu) && !fault->nested_page_fault) - vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); - else - vcpu->arch.mmu.inject_page_fault(vcpu, fault); + struct kvm_mmu *fault_mmu; + WARN_ON_ONCE(fault->vector != PF_VECTOR); + fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu : + vcpu->arch.walk_mmu; + + /* + * Invalidate the TLB entry for the faulting address, if it exists, + * else the access will fault indefinitely (and to emulate hardware). + */ + if ((fault->error_code & PFERR_PRESENT_MASK) && + !(fault->error_code & PFERR_RSVD_MASK)) + kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address, + fault_mmu->root_hpa); + + fault_mmu->inject_page_fault(vcpu, fault); return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -520,13 +697,13 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, false); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code, true); + kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true); } EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); @@ -536,7 +713,7 @@ */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -618,10 +795,8 @@ ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + out: return ret; @@ -631,7 +806,6 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - bool changed = true; int offset; gfn_t gfn; int r; @@ -639,8 +813,7 @@ if (!is_pae_paging(vcpu)) return false; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) return true; gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; @@ -648,17 +821,16 @@ r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -out: + return true; - return changed; + return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; } EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -676,27 +848,27 @@ if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return 1; - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { - int cs_db, cs_l; + if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) && + (cr0 & X86_CR0_PG)) { + int cs_db, cs_l; - if (!is_pae(vcpu)) - return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) - return 1; - } else -#endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + if (!is_pae(vcpu)) + return 1; + 
kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) return 1; } +#endif + if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) && + is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) + return 1; if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); @@ -721,27 +893,48 @@ } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; - } -} -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); + } + + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && + vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.pkru); +} +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); + +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { - if (vcpu->guest_xcr0_loaded) { + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.host_pkru); + } + + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -779,13 +972,13 @@ vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops->get_cpl(vcpu) != 0 || + if (kvm_x86_ops.get_cpl(vcpu) != 0 || __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; @@ -794,63 +987,20 @@ } EXPORT_SYMBOL_GPL(kvm_set_xcr); -static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) -{ - u64 reserved_bits = CR4_RESERVED_BITS; - - if (!cpu_has(c, X86_FEATURE_XSAVE)) - reserved_bits |= X86_CR4_OSXSAVE; - - if (!cpu_has(c, X86_FEATURE_SMEP)) - reserved_bits |= X86_CR4_SMEP; - - if (!cpu_has(c, X86_FEATURE_SMAP)) - reserved_bits |= X86_CR4_SMAP; - - if (!cpu_has(c, X86_FEATURE_FSGSBASE)) - reserved_bits |= X86_CR4_FSGSBASE; - - if (!cpu_has(c, X86_FEATURE_PKU)) - reserved_bits |= X86_CR4_PKE; - - if (!cpu_has(c, X86_FEATURE_LA57) && - !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57))) - reserved_bits |= X86_CR4_LA57; - - if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated()) - reserved_bits |= X86_CR4_UMIP; - - return reserved_bits; -} - -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return -EINVAL; - if 
(!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) + if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return -EINVAL; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return -EINVAL; - - if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4)) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(kvm_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -882,15 +1032,14 @@ return 1; } - if (kvm_x86_ops->set_cr4(vcpu, cr4)) - return 1; + kvm_x86_ops.set_cr4(vcpu, cr4); if (((cr4 ^ old_cr4) & mmu_role_bits) || (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -911,21 +1060,21 @@ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } return 0; } if (is_long_mode(vcpu) && - (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63))) + (cr3 & vcpu->arch.cr3_lm_rsvd_bits)) return 1; else if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); return 0; } @@ -963,13 +1112,7 @@ } } -static void kvm_update_dr6(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); -} - -static void kvm_update_dr7(struct kvm_vcpu *vcpu) +void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -977,11 +1120,12 @@ dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops->set_dr7(vcpu, dr7); + kvm_x86_ops.set_dr7(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -1003,17 +1147,14 @@ vcpu->arch.eff_db[dr] = val; break; case 4: - /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) + if (!kvm_dr6_valid(val)) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - kvm_update_dr6(vcpu); break; case 5: - /* fall through */ default: /* 7 */ - if (val & 0xffffffff00000000ULL) + if (!kvm_dr7_valid(val)) return -1; /* #GP */ vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -1042,15 +1183,10 @@ *val = vcpu->arch.db[array_index_nospec(dr, size)]; break; case 4: - /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *val = vcpu->arch.dr6; - else - *val = kvm_x86_ops->get_dr6(vcpu); + *val = vcpu->arch.dr6; break; case 5: - /* fall through */ default: /* 7 */ *val = vcpu->arch.dr7; break; @@ -1061,15 +1197,15 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) { - u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u32 ecx = 
kvm_rcx_read(vcpu); u64 data; int err; err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; - kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); - kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + kvm_rax_write(vcpu, (u32)data); + kvm_rdx_write(vcpu, data >> 32); return err; } EXPORT_SYMBOL_GPL(kvm_rdpmc); @@ -1078,26 +1214,66 @@ * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. */ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA, - MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, - MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES + MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX, + MSR_IA32_SPEC_CTRL, + MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH, + MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK, + MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B, + MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B, + MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B, + MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B, + MSR_IA32_UMWAIT_CONTROL, + + MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1, + MSR_ARCH_PERFMON_FIXED_CTR0 + 2, + MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS, + MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL, + MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1, + MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3, + MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5, + MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7, + MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9, + MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11, + MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, + MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, + MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, + MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, + MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, + MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, + MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7, + MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9, + MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11, + MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, + MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, + MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, + + MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3, + MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3, + MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2, + MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, + MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, 
MSR_F15H_PERF_CTR2, + MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1113,12 +1289,18 @@ HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, + MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1128,15 +1310,41 @@ MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, MSR_AMD64_VIRT_SPEC_CTRL, + MSR_IA32_POWER_CTL, + MSR_IA32_UCODE_REV, + + /* + * The following list leaves out MSRs whose values are determined + * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. + * We always support the "true" VMX control MSRs, even if the host + * processor does not, so I am putting these registers here rather + * than in msrs_to_save_all. + */ + MSR_IA32_VMX_BASIC, + MSR_IA32_VMX_TRUE_PINBASED_CTLS, + MSR_IA32_VMX_TRUE_PROCBASED_CTLS, + MSR_IA32_VMX_TRUE_EXIT_CTLS, + MSR_IA32_VMX_TRUE_ENTRY_CTLS, + MSR_IA32_VMX_MISC, + MSR_IA32_VMX_CR0_FIXED0, + MSR_IA32_VMX_CR4_FIXED0, + MSR_IA32_VMX_VMCS_ENUM, + MSR_IA32_VMX_PROCBASED_CTLS2, + MSR_IA32_VMX_EPT_VPID_CAP, + MSR_IA32_VMX_VMFUNC, + + MSR_K7_HWCR, + MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1156,18 +1364,41 @@ MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; -u64 kvm_get_arch_capabilities(void) -{ - u64 data; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. 
These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ - rdmsrl_safe(MSR_IA32_ARCH_CAPABILITIES, &data); +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) + +static u64 kvm_get_arch_capabilities(void) +{ + u64 data = 0; + + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + data &= KVM_SUPPORTED_ARCH_CAP; + } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1196,34 +1427,30 @@ if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; - /* - * On TAA affected systems, export MDS_NO=0 when: - * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1. - * - Updated microcode is present. This is detected by - * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures - * that VERW clears CPU buffers. - * - * When MDS_NO=0 is exported, guests deploy clear CPU buffer - * mitigation and don't complain: - * - * "Vulnerable: Clear CPU buffers attempted, no microcode" - * - * If TSX is disabled on the system, guests are also mitigated against - * TAA and clear CPU buffer mitigation is not required for guests. - */ - if (!boot_cpu_has(X86_FEATURE_RTM)) + if (!boot_cpu_has(X86_FEATURE_RTM)) { + /* + * If RTM=0 because the kernel has disabled TSX, the host might + * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 + * and therefore knows that there cannot be TAA) but keep + * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, + * and we want to allow migrating those guests to tsx=off hosts. + */ data &= ~ARCH_CAP_TAA_NO; - else if (!boot_cpu_has_bug(X86_BUG_TAA)) + } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { data |= ARCH_CAP_TAA_NO; - else if (data & ARCH_CAP_TSX_CTRL_MSR) - data &= ~ARCH_CAP_MDS_NO; + } else { + /* + * Nothing to do here; we emulate TSX_CTRL if present on the + * host so the guest can choose between disabling TSX or + * using VERW to clear CPU buffers. + */ + } - /* KVM does not emulate MSR_IA32_TSX_CTRL. 
*/ - data &= ~ARCH_CAP_TSX_CTRL_MSR; + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) + data |= ARCH_CAP_GDS_NO; + return data; } - -EXPORT_SYMBOL_GPL(kvm_get_arch_capabilities); static int kvm_get_msr_feature(struct kvm_msr_entry *msr) { @@ -1235,8 +1462,7 @@ rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops->get_msr_feature(msr)) - return 1; + return kvm_x86_ops.get_msr_feature(msr); } return 0; } @@ -1248,6 +1474,14 @@ msr.index = index; r = kvm_get_msr_feature(&msr); + + if (r == KVM_MSR_RET_INVALID) { + /* Unconditionally clear the output for simplicity */ + *data = 0; + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + r = 0; + } + if (r) return r; @@ -1262,6 +1496,13 @@ return false; if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) + return false; + + if (efer & (EFER_LME | EFER_LMA) && + !guest_cpuid_has(vcpu, X86_FEATURE_LM)) + return false; + + if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) return false; return true; @@ -1280,6 +1521,7 @@ { u64 old_efer = vcpu->arch.efer; u64 efer = msr_info->data; + int r; if (efer & efer_reserved_bits) return 1; @@ -1296,7 +1538,11 @@ efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops->set_efer(vcpu, efer); + r = kvm_x86_ops.set_efer(vcpu, efer); + if (r) { + WARN_ON(r > 0); + return r; + } /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1311,20 +1557,73 @@ } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); +bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) +{ + struct kvm_x86_msr_filter *msr_filter; + struct msr_bitmap_range *ranges; + struct kvm *kvm = vcpu->kvm; + bool allowed; + int idx; + u32 i; + + /* x2APIC MSRs do not support filtering. */ + if (index >= 0x800 && index <= 0x8ff) + return true; + + idx = srcu_read_lock(&kvm->srcu); + + msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); + if (!msr_filter) { + allowed = true; + goto out; + } + + allowed = msr_filter->default_allow; + ranges = msr_filter->ranges; + + for (i = 0; i < msr_filter->count; i++) { + u32 start = ranges[i].base; + u32 end = start + ranges[i].nmsrs; + u32 flags = ranges[i].flags; + unsigned long *bitmap = ranges[i].bitmap; + + if ((index >= start) && (index < end) && (flags & type)) { + allowed = !!test_bit(index - start, bitmap); + break; + } + + /* Note, VM-Exits that go down the "slow" path are accounted below. */ + ++vcpu->stat.exits; + } + +out: + srcu_read_unlock(&kvm->srcu, idx); + + return allowed; +} +EXPORT_SYMBOL_GPL(kvm_msr_allowed); + /* - * Writes msr value into into the appropriate "register". + * Write @data into the MSR specified by @index. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, + bool host_initiated) { - switch (msr->index) { + struct msr_data msr; + + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) + return KVM_MSR_RET_FILTERED; + + switch (index) { case MSR_FS_BASE: case MSR_GS_BASE: case MSR_KERNEL_GS_BASE: case MSR_CSTAR: case MSR_LSTAR: - if (is_noncanonical_address(msr->data, vcpu)) + if (is_noncanonical_address(data, vcpu)) return 1; break; case MSR_IA32_SYSENTER_EIP: @@ -1341,54 +1640,313 @@ * value, and that something deterministic happens if the guest * invokes 64-bit SYSENTER. 
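The canonicalization referred to here is just a sign extension from the CPU's virtual-address width. A standalone sketch of what get_canonical() amounts to, with a 48-bit width assumed for the demonstration (relies on arithmetic right shift of signed values, as compilers for x86 provide):

#include <stdint.h>
#include <stdio.h>

/* Sign-extend a linear address from vaddr_bits to 64 bits (illustrative). */
static uint64_t make_canonical(uint64_t la, int vaddr_bits)
{
	int shift = 64 - vaddr_bits;

	return (uint64_t)((int64_t)(la << shift) >> shift);
}

int main(void)
{
	/* A non-canonical value gets bits 63:48 copied from bit 47. */
	printf("%#llx\n",
	       (unsigned long long)make_canonical(0x0000800000000000ULL, 48));
	return 0;
}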
*/ - msr->data = get_canonical(msr->data, vcpu_virt_addr_bits(vcpu)); + data = get_canonical(data, vcpu_virt_addr_bits(vcpu)); } - return kvm_x86_ops->set_msr(vcpu, msr); + + msr.data = data; + msr.index = index; + msr.host_initiated = host_initiated; + + return kvm_x86_ops.set_msr(vcpu, &msr); +} + +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + int ret = __kvm_set_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) + if (kvm_msr_ignored_check(vcpu, index, data, true)) + ret = 0; + + return ret; +} + +/* + * Read the MSR specified by @index into @data. Select MSR specific fault + * checks are bypassed if @host_initiated is %true. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. + */ +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) +{ + struct msr_data msr; + int ret; + + if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) + return KVM_MSR_RET_FILTERED; + + msr.index = index; + msr.host_initiated = host_initiated; + + ret = kvm_x86_ops.get_msr(vcpu, &msr); + if (!ret) + *data = msr.data; + return ret; +} + +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) { + /* Unconditionally clear *data for simplicity */ + *data = 0; + if (kvm_msr_ignored_check(vcpu, index, 0, false)) + ret = 0; + } + + return ret; +} + +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + return kvm_get_msr_ignored_check(vcpu, index, data, false); +} +EXPORT_SYMBOL_GPL(kvm_get_msr); + +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) +{ + return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); + +static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read) +{ + if (vcpu->run->msr.error) { + kvm_inject_gp(vcpu, 0); + return 1; + } else if (is_read) { + kvm_rax_write(vcpu, (u32)vcpu->run->msr.data); + kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32); + } + + return kvm_skip_emulated_instruction(vcpu); +} + +static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, true); +} + +static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu) +{ + return complete_emulated_msr(vcpu, false); +} + +static u64 kvm_msr_reason(int r) +{ + switch (r) { + case KVM_MSR_RET_INVALID: + return KVM_MSR_EXIT_REASON_UNKNOWN; + case KVM_MSR_RET_FILTERED: + return KVM_MSR_EXIT_REASON_FILTER; + default: + return KVM_MSR_EXIT_REASON_INVAL; + } +} + +static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, + u32 exit_reason, u64 data, + int (*completion)(struct kvm_vcpu *vcpu), + int r) +{ + u64 msr_reason = kvm_msr_reason(r); + + /* Check if the user wanted to know about this MSR fault */ + if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) + return 0; + + vcpu->run->exit_reason = exit_reason; + vcpu->run->msr.error = 0; + memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); + vcpu->run->msr.reason = msr_reason; + vcpu->run->msr.index = index; + vcpu->run->msr.data = data; + vcpu->arch.complete_userspace_io = completion; + + return 1; +} + +static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0, + complete_emulated_rdmsr, r); +} + +static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 
data, int r) +{ + return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data, + complete_emulated_wrmsr, r); +} + +int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data; + int r; + + r = kvm_get_msr(vcpu, ecx, &data); + + /* MSR read failed? See if we should ask user space */ + if (r && kvm_get_msr_user_space(vcpu, ecx, r)) { + /* Bounce to user space */ + return 0; + } + + /* MSR read failed? Inject a #GP */ + if (r) { + trace_kvm_msr_read_ex(ecx); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_read(ecx, data); + + kvm_rax_write(vcpu, data & -1u); + kvm_rdx_write(vcpu, (data >> 32) & -1u); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); + +int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) +{ + u32 ecx = kvm_rcx_read(vcpu); + u64 data = kvm_read_edx_eax(vcpu); + int r; + + r = kvm_set_msr(vcpu, ecx, data); + + /* MSR write failed? See if we should ask user space */ + if (r && kvm_set_msr_user_space(vcpu, ecx, data, r)) + /* Bounce to user space */ + return 0; + + /* Signal all other negative errors to userspace */ + if (r < 0) + return r; + + /* MSR write failed? Inject a #GP */ + if (r > 0) { + trace_kvm_msr_write_ex(ecx, data); + kvm_inject_gp(vcpu, 0); + return 1; + } + + trace_kvm_msr_write(ecx, data); + return kvm_skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); + +bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) +{ + return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || + xfer_to_guest_mode_work_pending(); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request); + +/* + * The fast path for frequent and performance sensitive wrmsr emulation, + * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces + * the latency of virtual IPI by avoiding the expensive bits of transitioning + * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the + * other cases which must be called after interrupts are enabled on the host. 
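For reference, WRMSR supplies its 64-bit value split across EDX:EAX, and for the x2APIC ICR the destination APIC ID sits in the upper half. A standalone sketch of that packing; read_edx_eax() below is an illustrative stand-in for kvm_read_edx_eax():

#include <stdint.h>
#include <stdio.h>

/* WRMSR passes the value as EDX:EAX. */
static uint64_t read_edx_eax(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	/* x2APIC ICR: low 32 bits = command, high 32 bits = destination APIC ID. */
	uint64_t icr = read_edx_eax(/*edx=*/7, /*eax=*/0xfd);

	printf("dest=%u cmd=%#x\n", (uint32_t)(icr >> 32), (uint32_t)icr);
	return 0;
}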
+ */ +static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) +{ + if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic)) + return 1; + + if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && + ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && + ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && + ((u32)(data >> 32) != X2APIC_BROADCAST)) { + + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; + } + + return 1; +} + +static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) +{ + if (!kvm_can_use_hv_timer(vcpu)) + return 1; + + kvm_set_lapic_tscdeadline_msr(vcpu, data); + return 0; +} + +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +{ + u32 msr = kvm_rcx_read(vcpu); + u64 data; + fastpath_t ret = EXIT_FASTPATH_NONE; + + switch (msr) { + case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_EXIT_HANDLED; + } + break; + case MSR_IA32_TSCDEADLINE: + data = kvm_read_edx_eax(vcpu); + if (!handle_fastpath_set_tscdeadline(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_REENTER_GUEST; + } + break; + default: + break; + } + + if (ret != EXIT_FASTPATH_NONE) + trace_kvm_msr_write(msr, data); + + return ret; +} +EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); /* * Adapt set_msr() to msr_io()'s calling convention */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct msr_data msr; - int r; - - msr.index = index; - msr.host_initiated = true; - r = kvm_get_msr(vcpu, &msr); - if (r) - return r; - - *data = msr.data; - return 0; + return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - struct msr_data msr; - - msr.data = *data; - msr.index = index; - msr.host_initiated = true; - return kvm_set_msr(vcpu, &msr); + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 +struct pvclock_clock { + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; + u64 base_cycles; + u64 offset; +}; + struct pvclock_gtod_data { seqcount_t seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - } clock; + struct pvclock_clock clock; /* extract of a clocksource struct */ + struct pvclock_clock raw_clock; /* extract of a clocksource struct */ - u64 boot_ns; - u64 nsec_base; + ktime_t offs_boot; u64 wall_time_sec; }; @@ -1397,44 +1955,54 @@ static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; - - boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; vdata->clock.cycle_last = tk->tkr_mono.cycle_last; vdata->clock.mask = tk->tkr_mono.mask; vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; + vdata->clock.offset = tk->tkr_mono.base; - vdata->boot_ns = boot_ns; - vdata->nsec_base = 
tk->tkr_mono.xtime_nsec; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; + vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; + vdata->raw_clock.mask = tk->tkr_raw.mask; + vdata->raw_clock.mult = tk->tkr_raw.mult; + vdata->raw_clock.shift = tk->tkr_raw.shift; + vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; + vdata->raw_clock.offset = tk->tkr_raw.base; vdata->wall_time_sec = tk->xtime_sec; + vdata->offs_boot = tk->offs_boot; + write_seqcount_end(&vdata->seq); } -#endif -void kvm_set_pending_timer(struct kvm_vcpu *vcpu) +static s64 get_kvmclock_base_ns(void) { - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. This function is only called from - * the physical CPU that is running vcpu. - */ - kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + /* Count up from boot time, but with the frequency of the raw clock. */ + return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); } +#else +static s64 get_kvmclock_base_ns(void) +{ + /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ + return ktime_get_boottime_ns(); +} +#endif static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { int version; int r; struct pvclock_wall_clock wc; - struct timespec64 boot; + u64 wall_nsec; + + kvm->arch.wall_clock = wall_clock; if (!wall_clock) return; @@ -1454,23 +2022,46 @@ /* * The guest calculates current wall clock time by adding * system time (updated by kvm_guest_time_update below) to the - * wall clock specified here. guest system time equals host - * system time for us, thus we must fill in host boot time here. + * wall clock specified here. We do the reverse here. */ - getboottime64(&boot); + wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm); - if (kvm->arch.kvmclock_offset) { - struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset); - boot = timespec64_sub(boot, ts); - } - wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */ - wc.nsec = boot.tv_nsec; + wc.nsec = do_div(wall_nsec, 1000000000); + wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ wc.version = version; kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); version++; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +} + +static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, + bool old_msr, bool host_initiated) +{ + struct kvm_arch *ka = &vcpu->kvm->arch; + + if (vcpu->vcpu_id == 0 && !host_initiated) { + if (ka->boot_vcpu_runs_old_kvmclock != old_msr) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + ka->boot_vcpu_runs_old_kvmclock = old_msr; + } + + vcpu->arch.time = system_time; + kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); + + /* we verify if the enable bit is set... 
*/ + vcpu->arch.pv_time_enabled = false; + if (!(system_time & 1)) + return; + + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, system_time & ~1ULL, + sizeof(struct pvclock_vcpu_time_info))) + vcpu->arch.pv_time_enabled = true; + + return; } static uint32_t div_frac(uint32_t dividend, uint32_t divisor) @@ -1505,9 +2096,6 @@ *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1604,7 +2192,7 @@ static inline int gtod_is_based_on_tsc(int mode) { - return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; + return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; } static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) @@ -1633,12 +2221,6 @@ atomic_read(&vcpu->kvm->online_vcpus), ka->use_master_clock, gtod->clock.vclock_mode); #endif -} - -static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) -{ - u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); - vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } /* @@ -1679,15 +2261,14 @@ u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); - - return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); + vcpu->arch.l1_tsc_offset = offset; + vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -1697,29 +2278,28 @@ * TSC is marked unstable when we're running on Hyper-V, * 'TSC page' clocksource is good. */ - if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) return false; #endif return check_tsc_unstable(); } -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; bool matched; bool already_matched; - u64 data = msr->data; bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); - ns = ktime_get_boot_ns(); + ns = get_kvmclock_base_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - if (data == 0 && msr->host_initiated) { + if (data == 0) { /* * detection of vcpu initialization -- need to sync * with other vCPUs. 
This particularly helps to keep @@ -1750,12 +2330,10 @@ vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -1774,8 +2352,6 @@ kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); } /* @@ -1793,9 +2369,6 @@ vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; - if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) - update_ia32_tsc_adjust_msr(vcpu, offset); - kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); @@ -1810,12 +2383,10 @@ spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } -EXPORT_SYMBOL_GPL(kvm_write_tsc); - static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -1849,43 +2420,43 @@ return last; } -static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) +static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, + int *mode) { long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; u64 tsc_pg_val; - switch (gtod->clock.vclock_mode) { - case VCLOCK_HVCLOCK: + switch (clock->vclock_mode) { + case VDSO_CLOCKMODE_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ - *mode = VCLOCK_HVCLOCK; - v = (tsc_pg_val - gtod->clock.cycle_last) & - gtod->clock.mask; + *mode = VDSO_CLOCKMODE_HVCLOCK; + v = (tsc_pg_val - clock->cycle_last) & + clock->mask; } else { /* TSC page invalid */ - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } break; - case VCLOCK_TSC: - *mode = VCLOCK_TSC; + case VDSO_CLOCKMODE_TSC: + *mode = VDSO_CLOCKMODE_TSC; *tsc_timestamp = read_tsc(); - v = (*tsc_timestamp - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (*tsc_timestamp - clock->cycle_last) & + clock->mask; break; default: - *mode = VCLOCK_NONE; + *mode = VDSO_CLOCKMODE_NONE; } - if (*mode == VCLOCK_NONE) + if (*mode == VDSO_CLOCKMODE_NONE) *tsc_timestamp = v = 0; - return v * gtod->clock.mult; + return v * clock->mult; } -static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) +static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1894,10 +2465,10 @@ do { seq = read_seqcount_begin(>od->seq); - ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); - ns >>= gtod->clock.shift; - ns += gtod->boot_ns; + ns = gtod->raw_clock.base_cycles; + ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); + ns >>= gtod->raw_clock.shift; + ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot)); } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -1914,8 +2485,8 @@ do { seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; - ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns = gtod->clock.base_cycles; + ns += vgettsc(>od->clock, tsc_timestamp, 
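/*
 * Illustrative aside, not part of this patch: vgettsc()/do_monotonic_raw()
 * above convert a masked cycle delta into nanoseconds with a precomputed
 * (mult, shift) pair, i.e. ns = (delta * mult) >> shift, before adding the
 * base and boot offsets.  The pair itself comes from kvm_get_time_scale().
 * The constants below are made up for a 3 GHz clock, and the plain 64-bit
 * multiply ignores the overflow handling the real code relies on.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t cycles_to_ns(uint64_t delta, uint32_t mult, uint32_t shift)
{
        return (delta * mult) >> shift;
}

int main(void)
{
        /* mult / 2^shift ~= 1/3, so each cycle counts as a third of a ns */
        uint32_t mult = 1431655765, shift = 32;

        /* 3e9 cycles at 3 GHz -> roughly one second */
        printf("%llu ns\n",
               (unsigned long long)cycles_to_ns(3000000000ULL, mult, shift));
        return 0;
}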
&mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1932,7 +2503,7 @@ if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, tsc_timestamp)); } @@ -2057,7 +2628,7 @@ spin_lock(&ka->pvclock_gtod_sync_lock); if (!ka->use_master_clock) { spin_unlock(&ka->pvclock_gtod_sync_lock); - return ktime_get_boot_ns() + ka->kvmclock_offset; + return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; @@ -2073,7 +2644,7 @@ &hv_clock.tsc_to_system_mul); ret = __pvclock_read_cycles(&hv_clock, rdtsc()); } else - ret = ktime_get_boot_ns() + ka->kvmclock_offset; + ret = get_kvmclock_base_ns() + ka->kvmclock_offset; put_cpu(); @@ -2172,7 +2743,7 @@ } if (!use_master_clock) { host_tsc = rdtsc(); - kernel_ns = ktime_get_boot_ns(); + kernel_ns = get_kvmclock_base_ns(); } tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); @@ -2284,6 +2855,18 @@ KVMCLOCK_SYNC_PERIOD); } +/* + * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. + */ +static bool can_set_mci_status(struct kvm_vcpu *vcpu) +{ + /* McStatusWrEn enabled? */ + if (guest_cpuid_is_amd_or_hygon(vcpu)) + return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); + + return false; +} + static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -2313,14 +2896,22 @@ /* only 0 or all 1s can be written to IA32_MCi_CTL * some Linux kernels though clear bit 10 in bank 4 to * workaround a BIOS/GART TBL issue on AMD K8s, ignore - * this to avoid an uncatched #GP in the guest + * this to avoid an uncatched #GP in the guest. + * + * UNIXWARE clears bit 0 of MC1_CTL to ignore + * correctable, single-bit ECC data errors. 
*/ if ((offset & 0x3) == 0 && - data != 0 && (data | (1 << 10)) != ~(u64)0) - return -1; + data != 0 && (data | (1 << 10) | 1) != ~(u64)0) + return 1; + + /* MCi_STATUS */ if (!msr_info->host_initiated && - (offset & 0x3) == 1 && data != 0) - return -1; + (offset & 0x3) == 1 && data != 0) { + if (!can_set_mci_status(vcpu)) + return 1; + } + vcpu->arch.mce_banks[offset] = data; break; } @@ -2340,104 +2931,192 @@ u32 page_num = data & ~PAGE_MASK; u64 page_addr = data & PAGE_MASK; u8 *page; - int r; - r = -E2BIG; if (page_num >= blob_size) - goto out; - r = -ENOMEM; + return 1; + page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE); - if (IS_ERR(page)) { - r = PTR_ERR(page); - goto out; + if (IS_ERR(page)) + return PTR_ERR(page); + + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { + kfree(page); + return 1; } - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) - goto out_free; - r = 0; -out_free: - kfree(page); -out: - return r; + return 0; +} + +static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; } static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 3:5 are reserved, Should be zero */ - if (data & 0x38) + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) return 1; - vcpu->arch.apf.msr_val = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) + return 1; - if (!(data & KVM_ASYNC_PF_ENABLED)) { + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && + (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) + return 1; + + if (!lapic_in_kernel(vcpu)) + return data ? 1 : 0; + + vcpu->arch.apf.msr_en_val = data; + + if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) + sizeof(u64))) return 1; vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + kvm_async_pf_wakeup_all(vcpu); + + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + return 0; } static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush_all(vcpu); +} + +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_guest(vcpu); } static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + u64 steal; + u32 version; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. 
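/*
 * Illustrative aside, not part of this patch: bit layout of the value that
 * kvm_pv_enable_async_pf() above validates.  Bit 0 enables async page
 * faults, bit 1 is "send always", bit 2 requests #PF-style vmexit delivery,
 * bit 3 (new with MSR_KVM_ASYNC_PF_INT) requests interrupt delivery, bits
 * 4:5 are reserved, and the remaining bits hold the GPA of the 64-byte
 * aligned shared area.  The local macros mirror the uapi flags but are
 * defined here only for this stand-alone sketch.
 */
#include <stdbool.h>
#include <stdint.h>

#define PF_ENABLED              (1ULL << 0)
#define PF_SEND_ALWAYS          (1ULL << 1)
#define PF_DELIVERY_AS_VMEXIT   (1ULL << 2)
#define PF_DELIVERY_AS_INT      (1ULL << 3)
#define PF_RESERVED             (3ULL << 4)     /* bits 4:5 must be zero */

static bool async_pf_msr_valid(uint64_t data, uint64_t *gpa)
{
        if (data & PF_RESERVED)
                return false;
        *gpa = data & ~0x3fULL;                 /* 64-byte aligned shared area */
        return true;
}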
*/ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { + u8 st_preempted = 0; + int err = -EFAULT; - vcpu->arch.st.preempted = 0; + if (!user_access_begin(st, sizeof(*st))) + return; - if (st->version & 1) - st->version += 1; /* first time write, random junk */ + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+q" (st_preempted), + "+&r" (err), + "+m" (st->preempted)); + if (err) + goto out; - st->version += 1; + user_access_end(); + + vcpu->arch.st.preempted = 0; + + trace_kvm_pv_tlb_flush(vcpu->vcpu_id, + st_preempted & KVM_VCPU_FLUSH_TLB); + if (st_preempted & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; + } else { + if (!user_access_begin(st, sizeof(*st))) + return; + + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; + } + + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ + + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); + version += 1; + unsafe_put_user(version, &st->version, out); - st->version += 1; - - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -2465,14 +3144,31 @@ return 1; vcpu->arch.arch_capabilities = data; break; + case MSR_IA32_PERF_CAPABILITIES: { + struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + + if (!msr_info->host_initiated) + return 1; + if (kvm_get_msr_feature(&msr_ent)) + return 1; + if (data & ~msr_ent.data) + return 1; + + vcpu->arch.perf_capabilities = data; + + return 0; + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ - data &= ~(u64)0x40000; /* ignore Mc status write enable */ - if (data != 0) { + + /* Handle McStatusWrEn */ + if (data == BIT_ULL(18)) { + vcpu->arch.msr_hwcr = data; + } else if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); return 1; @@ -2493,9 +3189,9 @@ /* Values other than LBR and BTF are vendor-specific, thus reserved and should throw a #GP */ return 1; - } - vcpu_unimpl(vcpu, "%s: 
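/*
 * Illustrative aside, not part of this patch: the record_steal_time()
 * rewrite above keeps the usual pvclock versioning protocol -- the version
 * field is bumped to an odd value while the record is being updated and
 * back to even once it is consistent, so a guest reader retries if it sees
 * an odd version or a version change.  The single-threaded sketch below
 * (no real memory barriers) only shows the shape of both sides.
 */
#include <stdint.h>

struct steal_time_model {
        uint32_t version;
        uint64_t steal;                 /* accumulated run_delay, in ns */
};

static void steal_writer_update(struct steal_time_model *st, uint64_t delta)
{
        if (st->version & 1)
                st->version++;          /* first-time write: fix up junk */
        st->version++;                  /* odd: update in progress */
        /* smp_wmb() in the real code */
        st->steal += delta;
        st->version++;                  /* even: record consistent again */
}

static uint64_t steal_reader_read(const struct steal_time_model *st)
{
        uint32_t v;
        uint64_t steal;

        do {
                v = st->version;
                /* smp_rmb() in the real code */
                steal = st->steal;
                /* smp_rmb() in the real code */
        } while ((v & 1) || v != st->version);

        return steal;
}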
MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + } else if (report_ignored_msrs) + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case 0x200 ... 0x2ff: return kvm_mtrr_set_msr(vcpu, msr, data); @@ -2520,15 +3216,46 @@ } break; case MSR_IA32_MISC_ENABLE: - vcpu->arch.ia32_misc_enable_msr = data; + if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) && + ((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { + if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) + return 1; + vcpu->arch.ia32_misc_enable_msr = data; + kvm_update_cpuid_runtime(vcpu); + } else { + vcpu->arch.ia32_misc_enable_msr = data; + } break; case MSR_IA32_SMBASE: if (!msr_info->host_initiated) return 1; vcpu->arch.smbase = data; break; + case MSR_IA32_POWER_CTL: + vcpu->arch.msr_ia32_power_ctl = data; + break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, msr_info); + if (msr_info->host_initiated) { + kvm_synchronize_tsc(vcpu, data); + } else { + u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset; + adjust_tsc_offset_guest(vcpu, adj); + vcpu->arch.ia32_tsc_adjust_msr += adj; + } + break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. + */ + if (data & ~supported_xss) + return 1; + vcpu->arch.ia32_xss = data; + kvm_update_cpuid_runtime(vcpu); break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) @@ -2536,46 +3263,54 @@ vcpu->arch.smi_count = data; break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + + kvm_write_wall_clock(vcpu->kvm, data); + break; case MSR_KVM_WALL_CLOCK: - vcpu->kvm->arch.wall_clock = data; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + kvm_write_wall_clock(vcpu->kvm, data); break; case MSR_KVM_SYSTEM_TIME_NEW: - case MSR_KVM_SYSTEM_TIME: { - struct kvm_arch *ka = &vcpu->kvm->arch; + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; - kvmclock_reset(vcpu); - - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { - bool tmp = (msr == MSR_KVM_SYSTEM_TIME); - - if (ka->boot_vcpu_runs_old_kvmclock != tmp) - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - - ka->boot_vcpu_runs_old_kvmclock = tmp; - } - - vcpu->arch.time = data; - kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); - - /* we verify if the enable bit is set... 
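/*
 * Illustrative aside, not part of this patch: the non-host-initiated
 * MSR_IA32_TSC write above is folded into the TSC offset.  Assuming a 1:1
 * scaling ratio, guest_tsc = host_tsc + offset, so a guest writing target
 * value T while the host TSC reads H implies a new offset of T - H, and the
 * delta from the old offset is what also lands in IA32_TSC_ADJUST.  The
 * numbers below are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t host_tsc    = 5000000000ULL;   /* rdtsc() at write time */
        int64_t  old_offset  = -1000000000LL;   /* current L1 TSC offset */
        uint64_t guest_write = 0;               /* guest writes 0 to the MSR */

        int64_t new_offset = (int64_t)(guest_write - host_tsc);
        int64_t adj        = new_offset - old_offset;

        printf("new offset %lld, adjustment %lld\n",
               (long long)new_offset, (long long)adj);
        return 0;
}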
*/ - if (!(data & 1)) - break; - - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, - &vcpu->arch.pv_time, data & ~1ULL, - sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else - vcpu->arch.pv_time_enabled = true; - + kvm_write_system_time(vcpu, data, false, msr_info->host_initiated); break; - } + case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + kvm_write_system_time(vcpu, data, true, msr_info->host_initiated); + break; case MSR_KVM_ASYNC_PF_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 1; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + if (data & 0x1) { + vcpu->arch.apf.pageready_pending = false; + kvm_check_async_pf_completion(vcpu); + } + break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; if (unlikely(!sched_info_on())) return 1; @@ -2592,8 +3327,22 @@ break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) return 1; + break; + + case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + + /* only enable bit supported */ + if (data & (-1ULL << 1)) + return 1; + + vcpu->arch.msr_kvm_poll_control = data; break; case MSR_IA32_MCG_CTL: @@ -2603,7 +3352,8 @@ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: - pr = true; /* fall through */ + pr = true; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr)) @@ -2624,6 +3374,8 @@ */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -2669,33 +3421,11 @@ return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, - "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); - break; - } + return KVM_MSR_RET_INVALID; } return 0; } EXPORT_SYMBOL_GPL(kvm_set_msr_common); - - -/* - * Reads an msr value (of 'msr_index') into 'pdata'. - * Returns 0 on success, non-0 otherwise. - * Assumes vcpu_load() was already called. - */ -int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) -{ - return kvm_x86_ops->get_msr(vcpu, msr); -} -EXPORT_SYMBOL_GPL(kvm_get_msr); static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) { @@ -2748,7 +3478,6 @@ case MSR_K8_SYSCFG: case MSR_K8_TSEG_ADDR: case MSR_K8_TSEG_MASK: - case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: @@ -2757,6 +3486,17 @@ case MSR_IA32_PERF_CTL: case MSR_AMD64_DC_CFG: case MSR_F15H_EX_CFG: + /* + * Intel Sandy Bridge CPUs must support the RAPL (running average power + * limit) MSRs. Just return 0, as we do not want to expose the host + * data here. Do not conditionalize this on CPUID, as KVM does not do + * so for existing CPU-specific MSRs. 
+ */ + case MSR_RAPL_POWER_UNIT: + case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ + case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ + case MSR_PKG_ENERGY_STATUS: /* Total package */ + case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ msr_info->data = 0; break; case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: @@ -2765,7 +3505,7 @@ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + return kvm_pmu_get_msr(vcpu, msr_info); msr_info->data = 0; break; case MSR_IA32_UCODE_REV: @@ -2777,9 +3517,31 @@ return 1; msr_info->data = vcpu->arch.arch_capabilities; break; - case MSR_IA32_TSC: - msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; break; + case MSR_IA32_POWER_CTL: + msr_info->data = vcpu->arch.msr_ia32_power_ctl; + break; + case MSR_IA32_TSC: { + /* + * Intel SDM states that MSR_IA32_TSC read adds the TSC offset + * even when not intercepted. AMD manual doesn't explicitly + * state this but appears to behave the same. + * + * On userspace reads and writes, however, we unconditionally + * return L1's TSC value to ensure backwards-compatible + * behavior for migration. + */ + u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset : + vcpu->arch.tsc_offset; + + msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset; + break; + } case MSR_MTRRcap: case 0x200 ... 0x2ff: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2805,7 +3567,6 @@ break; case APIC_BASE_MSR ... 
APIC_BASE_MSR + 0xff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; @@ -2833,21 +3594,64 @@ msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->kvm->arch.wall_clock; + break; case MSR_KVM_WALL_CLOCK_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) + return 1; + + msr_info->data = vcpu->arch.time; + break; case MSR_KVM_SYSTEM_TIME_NEW: + if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) + return 1; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) + return 1; + + msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + msr_info->data = vcpu->arch.apf.msr_int_val; + break; + case MSR_KVM_ASYNC_PF_ACK: + if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) + return 1; + + msr_info->data = 0; break; case MSR_KVM_STEAL_TIME: + if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) + return 1; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) + return 1; + msr_info->data = vcpu->arch.pv_eoi.msr_val; + break; + case MSR_KVM_POLL_CONTROL: + if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) + return 1; + + msr_info->data = vcpu->arch.msr_kvm_poll_control; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: @@ -2857,6 +3661,12 @@ case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2870,6 +3680,8 @@ msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -2879,7 +3691,6 @@ return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. 
It is however accessed by winxp in very narrow @@ -2912,20 +3723,13 @@ case MSR_MISC_FEATURES_ENABLES: msr_info->data = vcpu->arch.msr_misc_features_enables; break; + case MSR_K7_HWCR: + msr_info->data = vcpu->arch.msr_hwcr; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) - return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", - msr_info->index); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", - msr_info->index); - msr_info->data = 0; - } - break; + return kvm_pmu_get_msr(vcpu, msr_info); + return KVM_MSR_RET_INVALID; } return 0; } @@ -2966,7 +3770,7 @@ unsigned size; r = -EFAULT; - if (copy_from_user(&msrs, user_msrs, sizeof msrs)) + if (copy_from_user(&msrs, user_msrs, sizeof(msrs))) goto out; r = -E2BIG; @@ -3037,24 +3841,33 @@ case KVM_CAP_HYPERV_VP_INDEX: case KVM_CAP_HYPERV_EVENTFD: case KVM_CAP_HYPERV_TLBFLUSH: + case KVM_CAP_HYPERV_SEND_IPI: + case KVM_CAP_HYPERV_CPUID: case KVM_CAP_PCI_SEGMENT: case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: - case KVM_CAP_ENABLE_CAP_VM: case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: + case KVM_CAP_PMU_EVENT_FILTER: case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: + case KVM_CAP_EXCEPTION_PAYLOAD: + case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: + case KVM_CAP_X86_USER_SPACE_MSR: + case KVM_CAP_X86_MSR_FILTER: + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3064,7 +3877,8 @@ r = KVM_CLOCK_TSC_STABLE; break; case KVM_CAP_X86_DISABLE_EXITS: - r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE; + r |= KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE | + KVM_X86_DISABLE_EXITS_CSTATE; if(kvm_can_mwait_in_guest()) r |= KVM_X86_DISABLE_EXITS_MWAIT; break; @@ -3077,10 +3891,10 @@ * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); + r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = !kvm_x86_ops.cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3090,9 +3904,6 @@ break; case KVM_CAP_MAX_VCPU_ID: r = KVM_MAX_VCPU_ID; - break; - case KVM_CAP_NR_MEMSLOTS: - r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -3110,8 +3921,20 @@ r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? 
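/*
 * Illustrative aside, not part of this patch: how userspace is expected to
 * consume KVM_CAP_X86_DISABLE_EXITS as extended above.  KVM_CHECK_EXTENSION
 * reports the bitmask of exits that may be disabled (now including the
 * C-state exits), and a subset of that mask is handed back through
 * KVM_ENABLE_CAP on the VM fd.  Error handling is omitted; this is only a
 * sketch of the flow.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static void disable_hlt_and_pause_exits(int vm_fd)
{
        struct kvm_enable_cap cap = { .cap = KVM_CAP_X86_DISABLE_EXITS };
        int supported = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_X86_DISABLE_EXITS);

        cap.args[0] = supported & (KVM_X86_DISABLE_EXITS_HLT |
                                   KVM_X86_DISABLE_EXITS_PAUSE);
        ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}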
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; + break; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + r = kvm_x86_ops.enable_direct_tlbflush != NULL; + break; + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; + break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; + case KVM_CAP_STEAL_TIME: + r = sched_info_on(); break; default: break; @@ -3133,11 +3956,11 @@ unsigned n; r = -EFAULT; - if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) + if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list))) goto out; n = msr_list.nmsrs; msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; - if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) + if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list))) goto out; r = -E2BIG; if (n < msr_list.nmsrs) @@ -3159,7 +3982,7 @@ struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries, @@ -3168,12 +3991,12 @@ goto out; r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) @@ -3205,9 +4028,9 @@ case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; - } default: r = -EINVAL; + break; } out: return r; @@ -3227,14 +4050,17 @@ { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) + if (kvm_x86_ops.has_wbinvd_exit()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_x86_ops.vcpu_load(vcpu, cpu); + + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3275,52 +4101,68 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + /* + * The vCPU can be marked preempted if and only if the VM-Exit was on + * an instruction boundary and will not trigger guest emulation of any + * kind (see vcpu_run). Vendor specific code controls (conservatively) + * when this is true, for example allowing the vCPU to be marked + * preempted if and only if the VM-Exit was due to a host interrupt. 
+ */ + if (!vcpu->arch.at_instruction_boundary) { + vcpu->stat.preemption_other++; + return; + } + + vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (unlikely(slots->generation != ghc->generation || + gpa != ghc->gpa || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) + return; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); + + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + if (vcpu->preempted) { + vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); - /* - * Disable page faults because we're in atomic context here. - * kvm_write_guest_offset_cached() would call might_fault() - * that relies on pagefault_disable() to tell if there's a - * bug. NOTE: the write to guest memory may not go through if - * during postcopy live migration or if there's heavy guest - * paging. - */ - pagefault_disable(); - /* - * kvm_memslots() will be called by - * kvm_write_guest_offset_cached() so take the srcu lock. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - pagefault_enable(); - kvm_x86_ops->vcpu_put(vcpu); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + kvm_x86_ops.vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -3334,7 +4176,7 @@ struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -3453,8 +4295,7 @@ for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - if (kvm_x86_ops->setup_mce) - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops.setup_mce(vcpu); out: return r; } @@ -3516,28 +4357,56 @@ process_smi(vcpu); /* - * FIXME: pass injected and pending separately. This is only - * needed for nested virtualization, whose state cannot be - * migrated yet. For now we can combine them. + * In guest mode, payload delivery should be deferred, + * so that the L1 hypervisor can intercept #PF before + * CR2 is modified (or intercept #DB before DR6 is + * modified under nVMX). Unless the per-VM capability, + * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of + * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we + * opportunistically defer the exception payload, deliver it if the + * capability hasn't been requested before processing a + * KVM_GET_VCPU_EVENTS. 
*/ - events->exception.injected = - (vcpu->arch.exception.pending || - vcpu->arch.exception.injected) && - !kvm_exception_is_soft(vcpu->arch.exception.nr); + if (!vcpu->kvm->arch.exception_payload_enabled && + vcpu->arch.exception.pending && vcpu->arch.exception.has_payload) + kvm_deliver_exception_payload(vcpu); + + /* + * The API doesn't provide the instruction length for software + * exceptions, so don't report them. As long as the guest RIP + * isn't advanced, we should expect to encounter the exception + * again. + */ + if (kvm_exception_is_soft(vcpu->arch.exception.nr)) { + events->exception.injected = 0; + events->exception.pending = 0; + } else { + events->exception.injected = vcpu->arch.exception.injected; + events->exception.pending = vcpu->arch.exception.pending; + /* + * For ABI compatibility, deliberately conflate + * pending and injected exceptions when + * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. + */ + if (!vcpu->kvm->arch.exception_payload_enabled) + events->exception.injected |= + vcpu->arch.exception.pending; + } events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; - events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; + events->exception_has_payload = vcpu->arch.exception.has_payload; + events->exception_payload = vcpu->arch.exception.payload; events->interrupt.injected = vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -3551,10 +4420,13 @@ events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SHADOW | KVM_VCPUEVENT_VALID_SMM); + if (vcpu->kvm->arch.exception_payload_enabled) + events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; + memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); +static void kvm_smm_changed(struct kvm_vcpu *vcpu); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -3562,12 +4434,24 @@ if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW - | KVM_VCPUEVENT_VALID_SMM)) + | KVM_VCPUEVENT_VALID_SMM + | KVM_VCPUEVENT_VALID_PAYLOAD)) return -EINVAL; - if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || - is_guest_mode(vcpu))) + if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { + if (!vcpu->kvm->arch.exception_payload_enabled) + return -EINVAL; + if (events->exception.pending) + events->exception.injected = 0; + else + events->exception_has_payload = 0; + } else { + events->exception.pending = 0; + events->exception_has_payload = 0; + } + + if ((events->exception.injected || events->exception.pending) && + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) return -EINVAL; /* INITs are latched while in SMM */ @@ -3577,35 +4461,40 @@ return -EINVAL; process_nmi(vcpu); - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = 
events->exception.injected; + vcpu->arch.exception.injected = events->exception.injected; + vcpu->arch.exception.pending = events->exception.pending; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; vcpu->arch.exception.error_code = events->exception.error_code; + vcpu->arch.exception.has_payload = events->exception_has_payload; + vcpu->arch.exception.payload = events->exception_payload; vcpu->arch.interrupt.injected = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, + kvm_x86_ops.set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - u32 hflags = vcpu->arch.hflags; - if (events->smi.smm) - hflags |= HF_SMM_MASK; - else - hflags &= ~HF_SMM_MASK; - kvm_set_hflags(vcpu, hflags); + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + + kvm_x86_ops.nested_ops->leave_nested(vcpu); + kvm_smm_changed(vcpu); + } vcpu->arch.smi_pending = events->smi.pending; @@ -3614,12 +4503,13 @@ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - } + } + + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } @@ -3633,12 +4523,11 @@ { unsigned long val; + memset(dbgregs, 0, sizeof(*dbgregs)); memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); kvm_get_dr(vcpu, 6, &val); dbgregs->dr6 = val; dbgregs->dr7 = vcpu->arch.dr7; - dbgregs->flags = 0; - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -3655,7 +4544,6 @@ memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); @@ -3666,7 +4554,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = xsave->header.xfeatures; u64 valid; @@ -3686,15 +4574,15 @@ */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *src = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *src = get_xsave_addr(xsave, xfeature_nr); if (src) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == 
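/*
 * Illustrative aside, not part of this patch: the fill_xsave()/load_xsave()
 * loops above walk the xfeature bitmap one component at a time.  "valid &
 * -valid" isolates the lowest set bit and fls64() - 1 recovers its bit
 * number, which is the XSAVE feature number handed to cpuid_count() and
 * get_xsave_addr().  Stand-alone model below, using a compiler builtin in
 * place of fls64().
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t valid = (1ULL << 2) | (1ULL << 9);     /* e.g. AVX and PKRU */

        while (valid) {
                uint64_t mask = valid & -valid;         /* lowest set feature */
                int nr = 63 - __builtin_clzll(mask);    /* fls64(mask) - 1 */

                printf("xfeature %d\n", nr);            /* prints 2, then 9 */
                valid -= mask;
        }
        return 0;
}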
XFEATURE_PKRU) memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); else @@ -3702,13 +4590,13 @@ } - valid -= feature; + valid -= xfeature_mask; } } static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3729,22 +4617,22 @@ */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *dest = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); else memcpy(dest, src + offset, size); } - valid -= feature; + valid -= xfeature_mask; } } @@ -3756,7 +4644,7 @@ fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state.fxsave, + &vcpu->arch.guest_fpu->state.fxsave, sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XFEATURE_MASK_FPSSE; @@ -3778,15 +4666,14 @@ * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. */ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) + if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { if (xstate_bv & ~XFEATURE_MASK_FPSSE || mxcsr & ~mxcsr_feature_mask) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state.fxsave, + memcpy(&vcpu->arch.guest_fpu->state.fxsave, guest_xsave->region, sizeof(struct fxregs_state)); } return 0; @@ -3847,6 +4734,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, struct kvm_enable_cap *cap) { + int r; + uint16_t vmcs_version; + void __user *user_ptr; + if (cap->flags) return -EINVAL; @@ -3854,11 +4745,37 @@ case KVM_CAP_HYPERV_SYNIC2: if (cap->args[0]) return -EINVAL; + fallthrough; + case KVM_CAP_HYPERV_SYNIC: if (!irqchip_in_kernel(vcpu->kvm)) return -EINVAL; return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); + case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: + if (!kvm_x86_ops.nested_ops->enable_evmcs) + return -ENOTTY; + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); + if (!r) { + user_ptr = (void __user *)(uintptr_t)cap->args[0]; + if (copy_to_user(user_ptr, &vmcs_version, + sizeof(vmcs_version))) + r = -EFAULT; + } + return r; + case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: + if (!kvm_x86_ops.enable_direct_tlbflush) + return -ENOTTY; + + return kvm_x86_ops.enable_direct_tlbflush(vcpu); + + case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: + vcpu->arch.pv_cpuid.enforce = cap->args[0]; + if (vcpu->arch.pv_cpuid.enforce) + kvm_update_pv_runtime(vcpu); + + return 0; + default: return -EINVAL; } @@ -3885,7 +4802,8 @@ r = -EINVAL; if (!lapic_in_kernel(vcpu)) goto out; - u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), + GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.lapic) @@ -3916,7 +4834,7 @@ struct kvm_interrupt irq; r = -EFAULT; - if (copy_from_user(&irq, argp, sizeof irq)) + if (copy_from_user(&irq, argp, sizeof(irq))) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, 
&irq); break; @@ -3934,7 +4852,7 @@ struct kvm_cpuid cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); break; @@ -3944,7 +4862,7 @@ struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); @@ -3955,14 +4873,14 @@ struct kvm_cpuid2 cpuid; r = -EFAULT; - if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) goto out; r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid, cpuid_arg->entries); if (r) goto out; r = -EFAULT; - if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid)) + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) goto out; r = 0; break; @@ -3983,13 +4901,13 @@ struct kvm_tpr_access_ctl tac; r = -EFAULT; - if (copy_from_user(&tac, argp, sizeof tac)) + if (copy_from_user(&tac, argp, sizeof(tac))) goto out; r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &tac, sizeof tac)) + if (copy_to_user(argp, &tac, sizeof(tac))) goto out; r = 0; break; @@ -4002,7 +4920,7 @@ if (!lapic_in_kernel(vcpu)) goto out; r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) + if (copy_from_user(&va, argp, sizeof(va))) goto out; idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); @@ -4013,7 +4931,7 @@ u64 mcg_cap; r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap))) goto out; r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); break; @@ -4022,7 +4940,7 @@ struct kvm_x86_mce mce; r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) + if (copy_from_user(&mce, argp, sizeof(mce))) goto out; r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); break; @@ -4072,7 +4990,7 @@ break; } case KVM_GET_XSAVE: { - u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xsave) break; @@ -4096,7 +5014,7 @@ break; } case KVM_GET_XCRS: { - u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT); r = -ENOMEM; if (!u.xcrs) break; @@ -4126,7 +5044,8 @@ r = -EINVAL; user_tsc_khz = (u32)arg; - if (user_tsc_khz >= kvm_max_guest_tsc_khz) + if (kvm_has_tsc_control && + user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; if (user_tsc_khz == 0) @@ -4159,7 +5078,7 @@ u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops->get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4167,8 +5086,8 @@ if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@ -4189,7 +5108,7 @@ int idx; r = -EINVAL; - if (!kvm_x86_ops->set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@ -4201,16 +5120,38 @@ break; if (kvm_state.flags & - ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE)) + ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING + | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending 
implies guest_mode. */ - if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING) + if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) + && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; + } + case KVM_GET_SUPPORTED_HV_CPUID: { + struct kvm_cpuid2 __user *cpuid_arg = argp; + struct kvm_cpuid2 cpuid; + + r = -EFAULT; + if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid))) + goto out; + + r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid, + cpuid_arg->entries); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid))) + goto out; + r = 0; break; } default: @@ -4234,14 +5175,14 @@ if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); + ret = kvm_x86_ops.set_tss_addr(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); + return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4382,9 +5323,6 @@ { struct kvm_pit *pit = kvm->arch.vpit; - if (!pit) - return -ENXIO; - /* pit->pit_state.lock was overloaded to prevent userspace from getting * an inconsistent state after running multiple KVM_REINJECT_CONTROL * ioctls in parallel. Use a separate lock if that ioctl isn't rare. @@ -4396,50 +5334,13 @@ return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool is_dirty = false; - int r; - - mutex_lock(&kvm->slots_lock); - /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &is_dirty); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). 
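/*
 * Illustrative aside, not part of this patch: the corrected flag check in
 * KVM_SET_NESTED_STATE above enforces the implication "nested_run_pending
 * implies guest_mode", i.e. KVM_STATE_NESTED_RUN_PENDING is only acceptable
 * when KVM_STATE_NESTED_GUEST_MODE is also set.  Minimal model of that
 * predicate; the local macros mirror the uapi values but exist only for
 * this sketch.
 */
#include <stdbool.h>
#include <stdint.h>

#define NESTED_GUEST_MODE       (1u << 0)
#define NESTED_RUN_PENDING      (1u << 1)

static bool nested_flags_consistent(uint32_t flags)
{
        return !(flags & NESTED_RUN_PENDING) || (flags & NESTED_GUEST_MODE);
}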
- */ - lockdep_assert_held(&kvm->slots_lock); - if (is_dirty) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; + if (kvm_x86_ops.flush_log_dirty) + kvm_x86_ops.flush_log_dirty(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -4454,8 +5355,8 @@ return 0; } -static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, - struct kvm_enable_cap *cap) +int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) { int r; @@ -4513,10 +5414,25 @@ kvm->arch.hlt_in_guest = true; if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) kvm->arch.pause_in_guest = true; + if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) + kvm->arch.cstate_in_guest = true; r = 0; break; case KVM_CAP_MSR_PLATFORM_INFO: kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; + r = 0; + break; + case KVM_CAP_EXCEPTION_PAYLOAD: + kvm->arch.exception_payload_enabled = cap->args[0]; + r = 0; + break; + case KVM_CAP_X86_USER_SPACE_MSR: + r = -EINVAL; + if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL | + KVM_MSR_EXIT_REASON_UNKNOWN | + KVM_MSR_EXIT_REASON_FILTER)) + break; + kvm->arch.user_space_msr_mask = cap->args[0]; r = 0; break; default: @@ -4525,6 +5441,180 @@ } return r; } + +static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) +{ + struct kvm_x86_msr_filter *msr_filter; + + msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT); + if (!msr_filter) + return NULL; + + msr_filter->default_allow = default_allow; + return msr_filter; +} + +static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) +{ + u32 i; + + if (!msr_filter) + return; + + for (i = 0; i < msr_filter->count; i++) + kfree(msr_filter->ranges[i].bitmap); + + kfree(msr_filter); +} + +static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, + struct kvm_msr_filter_range *user_range) +{ + struct msr_bitmap_range range; + unsigned long *bitmap = NULL; + size_t bitmap_size; + int r; + + if (!user_range->nmsrs) + return 0; + + bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); + if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) + return -EINVAL; + + bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); + if (IS_ERR(bitmap)) + return PTR_ERR(bitmap); + + range = (struct msr_bitmap_range) { + .flags = user_range->flags, + .base = user_range->base, + .nmsrs = user_range->nmsrs, + .bitmap = bitmap, + }; + + if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) { + r = -EINVAL; + goto err; + } + + if (!range.flags) { + r = -EINVAL; + goto err; + } + + /* Everything ok, add this range identifier. 
*/ + msr_filter->ranges[msr_filter->count] = range; + msr_filter->count++; + + return 0; +err: + kfree(bitmap); + return r; +} + +static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, + struct kvm_msr_filter *filter) +{ + struct kvm_x86_msr_filter *new_filter, *old_filter; + bool default_allow; + bool empty = true; + int r = 0; + u32 i; + + if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY) + return -EINVAL; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) + empty &= !filter->ranges[i].nmsrs; + + default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); + if (empty && !default_allow) + return -EINVAL; + + new_filter = kvm_alloc_msr_filter(default_allow); + if (!new_filter) + return -ENOMEM; + + for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { + r = kvm_add_msr_filter(new_filter, &filter->ranges[i]); + if (r) { + kvm_free_msr_filter(new_filter); + return r; + } + } + + mutex_lock(&kvm->lock); + + /* The per-VM filter is protected by kvm->lock... */ + old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1); + + rcu_assign_pointer(kvm->arch.msr_filter, new_filter); + synchronize_srcu(&kvm->srcu); + + kvm_free_msr_filter(old_filter); + + kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); + mutex_unlock(&kvm->lock); + + return 0; +} + +#ifdef CONFIG_KVM_COMPAT +/* for KVM_X86_SET_MSR_FILTER */ +struct kvm_msr_filter_range_compat { + __u32 flags; + __u32 nmsrs; + __u32 base; + __u32 bitmap; +}; + +struct kvm_msr_filter_compat { + __u32 flags; + struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; +}; + +#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) + +long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct kvm *kvm = filp->private_data; + long r = -ENOTTY; + + switch (ioctl) { + case KVM_X86_SET_MSR_FILTER_COMPAT: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter_compat filter_compat; + struct kvm_msr_filter filter; + int i; + + if (copy_from_user(&filter_compat, user_msr_filter, + sizeof(filter_compat))) + return -EFAULT; + + filter.flags = filter_compat.flags; + for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { + struct kvm_msr_filter_range_compat *cr; + + cr = &filter_compat.ranges[i]; + filter.ranges[i] = (struct kvm_msr_filter_range) { + .flags = cr->flags, + .nmsrs = cr->nmsrs, + .base = cr->base, + .bitmap = (__u8 *)(ulong)cr->bitmap, + }; + } + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } + } + + return r; +} +#endif long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -4555,7 +5645,7 @@ if (kvm->created_vcpus) goto set_identity_unlock; r = -EFAULT; - if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + if (copy_from_user(&ident_addr, argp, sizeof(ident_addr))) goto set_identity_unlock; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); set_identity_unlock: @@ -4639,7 +5729,7 @@ if (r) goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, chip, sizeof *chip)) + if (copy_to_user(argp, chip, sizeof(*chip))) goto get_irqchip_out; r = 0; get_irqchip_out: @@ -4660,9 +5750,6 @@ if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); - if (r) - goto set_irqchip_out; - r = 0; set_irqchip_out: kfree(chip); break; @@ -4685,7 +5772,7 @@ } case KVM_SET_PIT: { r = -EFAULT; - if (copy_from_user(&u.ps, argp, sizeof u.ps)) + if (copy_from_user(&u.ps, argp, sizeof(u.ps))) goto out; 
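/*
 * Illustrative aside, not part of this patch: how a filter assembled by
 * kvm_add_msr_filter()/kvm_vm_ioctl_set_msr_filter() above is meant to be
 * consulted.  Each range covers [base, base + nmsrs) and carries a bitmap
 * with one bit per MSR; an access that falls in a range is allowed iff its
 * bit is set, otherwise default_allow decides.  The stand-alone model below
 * ignores the per-range read/write flags and is not the kernel helper.
 */
#include <stdbool.h>
#include <stdint.h>

struct msr_range_model {
        uint32_t base;
        uint32_t nmsrs;
        const unsigned long *bitmap;
};

static bool msr_allowed(const struct msr_range_model *r, int nranges,
                        bool default_allow, uint32_t msr)
{
        int i;

        for (i = 0; i < nranges; i++) {
                uint32_t idx = msr - r[i].base;

                if (msr >= r[i].base && idx < r[i].nmsrs)
                        return r[i].bitmap[idx / (8 * sizeof(long))] &
                               (1UL << (idx % (8 * sizeof(long))));
        }
        return default_allow;
}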
mutex_lock(&kvm->lock); r = -ENXIO; @@ -4726,6 +5813,9 @@ struct kvm_reinject_control control; r = -EFAULT; if (copy_from_user(&control, argp, sizeof(control))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); break; @@ -4790,19 +5880,10 @@ r = 0; break; } - case KVM_ENABLE_CAP: { - struct kvm_enable_cap cap; - - r = -EFAULT; - if (copy_from_user(&cap, argp, sizeof(cap))) - goto out; - r = kvm_vm_ioctl_enable_cap(kvm, &cap); - break; - } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops->mem_enc_op) - r = kvm_x86_ops->mem_enc_op(kvm, argp); + if (kvm_x86_ops.mem_enc_op) + r = kvm_x86_ops.mem_enc_op(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -4813,8 +5894,8 @@ goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_reg_region) - r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_reg_region) + r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -4825,8 +5906,8 @@ goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_unreg_region) - r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_unreg_region) + r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -4838,6 +5919,19 @@ r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd); break; } + case KVM_SET_PMU_EVENT_FILTER: + r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); + break; + case KVM_X86_SET_MSR_FILTER: { + struct kvm_msr_filter __user *user_msr_filter = argp; + struct kvm_msr_filter filter; + + if (copy_from_user(&filter, user_msr_filter, sizeof(filter))) + return -EFAULT; + + r = kvm_vm_ioctl_set_msr_filter(kvm, &filter); + break; + } default: r = -ENOTTY; } @@ -4847,58 +5941,96 @@ static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, + "Please update the fixed PMCs in msrs_to_saved_all[]"); + + perf_get_x86_pmu_capability(&x86_pmu); + + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) + continue; + break; + case MSR_IA32_UMWAIT_CONTROL: + if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) + continue; + break; + case MSR_IA32_RTIT_CTL: + case MSR_IA32_RTIT_STATUS: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) + continue; + break; + case MSR_IA32_RTIT_CR3_MATCH: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) + continue; + break; + case MSR_IA32_RTIT_OUTPUT_BASE: + case MSR_IA32_RTIT_OUTPUT_MASK: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && + !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) + continue; + break; + case MSR_IA32_RTIT_ADDR0_A ... 
MSR_IA32_RTIT_ADDR3_B: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= + intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) + continue; + break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) continue; break; default: break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4947,13 +6079,13 @@ static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->set_segment(vcpu, var, seg); + kvm_x86_ops.set_segment(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->get_segment(vcpu, var, seg); + kvm_x86_ops.get_segment(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -4965,7 +6097,7 @@ /* NPT walks are always user-walks */ access |= PFERR_USER_MASK; - t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception); + t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception); return t_gpa; } @@ -4973,14 +6105,14 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -4988,7 +6120,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5037,7 +6169,7 @@ struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5062,7 +6194,7 @@ gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5083,7 +6215,7 @@ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -5136,7 +6268,7 @@ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -5149,13 +6281,6 @@ /* kvm_write_guest_virt_system can pull in tons of pages. */ vcpu->arch.l1tf_flush_l1d = true; - /* - * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED - * is returned, but our callers are not ready for that and they blindly - * call kvm_inject_page_fault. Ensure that they at least do not leak - * uninitialized kernel stack memory into cr2 and error code. - */ - memset(exception, 0, sizeof(*exception)); return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, PFERR_WRITE_MASK, exception); } @@ -5163,25 +6288,23 @@ int handle_ud(struct kvm_vcpu *vcpu) { + static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; int emul_type = EMULTYPE_TRAP_UD; - enum emulation_result er; char sig[5]; /* ud2; .ascii "kvm" */ struct x86_exception e; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0))) + return 1; if (force_emulation_prefix && kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), sig, sizeof(sig), &e) == 0 && - memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) { + memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) { kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig)); - emul_type = 0; + emul_type = EMULTYPE_TRAP_UD_FORCED; } - er = kvm_emulate_instruction(vcpu, emul_type); - if (er == EMULATE_USER_EXIT) - return 0; - if (er != EMULATE_DONE) - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; + return kvm_emulate_instruction(vcpu, emul_type); } EXPORT_SYMBOL_GPL(handle_ud); @@ -5204,7 +6327,7 @@ gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -5214,7 +6337,7 @@ */ if (vcpu_match_mmio_gva(vcpu, gva) && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.access, 0, access)) { + vcpu->arch.mmio_access, 0, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -5323,7 +6446,7 @@ int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@ -5332,10 +6455,9 @@ * operation using rep will only have the initial GPA from the NPF * occurred. 
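The gva_to_gpa helpers above all build the same access mask: CPL 3 turns on the user bit, and the read, fetch, and write variants add the corresponding page-fault error-code bit. A small self-contained sketch of that composition, with the architectural PFERR bit positions defined locally:

#include <stdbool.h>
#include <stdint.h>

/* x86 page-fault error code bits (architectural layout). */
#define PFERR_WRITE_MASK (1u << 1)
#define PFERR_USER_MASK  (1u << 2)
#define PFERR_FETCH_MASK (1u << 4)

/*
 * Build the access mask for a guest-virtual to guest-physical walk:
 * CPL 3 marks the access as a user access, and the kind of access adds
 * the write or fetch bit.  Plain reads add neither.
 */
static uint32_t walk_access_mask(int cpl, bool write, bool fetch)
{
        uint32_t access = (cpl == 3) ? PFERR_USER_MASK : 0;

        if (write)
                access |= PFERR_WRITE_MASK;
        if (fetch)
                access |= PFERR_FETCH_MASK;
        return access;
}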
*/ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5456,9 +6578,10 @@ unsigned int bytes, struct x86_exception *exception) { + struct kvm_host_map map; struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + u64 page_line_mask; gpa_t gpa; - struct page *page; char *kaddr; bool exchanged; @@ -5472,15 +6595,23 @@ (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto emul_write; - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + /* + * Emulate the atomic as a straight write to avoid #AC if SLD is + * enabled in the host and the access splits a cache line. + */ + if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) + page_line_mask = ~(cache_line_size() - 1); + else + page_line_mask = PAGE_MASK; + + if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) goto emul_write; - page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); - if (is_error_page(page)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) goto emul_write; - kaddr = kmap_atomic(page); - kaddr += offset_in_page(gpa); + kaddr = map.hva + offset_in_page(gpa); + switch (bytes) { case 1: exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); @@ -5497,13 +6628,12 @@ default: BUG(); } - kunmap_atomic(kaddr); - kvm_release_page_dirty(page); + + kvm_vcpu_unmap(vcpu, &map, true); if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_page_track_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -5557,11 +6687,9 @@ return 0; } -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int ret; if (vcpu->arch.pio.count) @@ -5581,20 +6709,33 @@ return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); +} + +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + return kvm_x86_ops.get_segment_base(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -5607,7 +6748,7 @@ if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops->has_wbinvd_exit()) { + if (kvm_x86_ops.has_wbinvd_exit()) { int cpu = get_cpu(); 
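The cmpxchg emulation change swaps the page mask for a cache-line mask when the host has split-lock detection enabled, so an emulated atomic that would straddle a cache line is downgraded to a plain write instead of risking #AC on the host. A self-contained sketch of that boundary test, assuming a 4 KiB page size:

#include <stdbool.h>
#include <stdint.h>

#define PAGE_MASK_4K (~((uint64_t)4096 - 1))

/*
 * Decide whether an emulated atomic at 'gpa' of 'bytes' bytes can be done
 * with a host cmpxchg: the first and last byte must fall inside the same
 * aligned unit.  With split-lock detection on, that unit shrinks from a
 * page to a cache line (mirrors the page_line_mask logic above).
 */
static bool atomic_emulation_ok(uint64_t gpa, unsigned int bytes,
                                bool host_split_lock_detect,
                                unsigned int cache_line_size)
{
        uint64_t mask = host_split_lock_detect ?
                        ~((uint64_t)cache_line_size - 1) : PAGE_MASK_4K;

        return ((gpa + bytes - 1) & mask) == (gpa & mask);
}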
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -5712,27 +6853,27 @@ static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); + return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -5810,28 +6951,33 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - struct msr_data msr; + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int r; - msr.index = msr_index; - msr.host_initiated = false; - r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); - if (r) - return r; + r = kvm_get_msr(vcpu, msr_index, pdata); - *pdata = msr.data; - return 0; + if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - struct msr_data msr; + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + int r; - msr.data = data; - msr.index = msr_index; - msr.host_initiated = false; - return kvm_set_msr(emul_to_vcpu(ctxt), &msr); + r = kvm_set_msr(vcpu, msr_index, data); + + if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) { + /* Bounce to user space */ + return X86EMUL_IO_NEEDED; + } + + return r; } static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) @@ -5851,7 +6997,7 @@ static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -5869,13 +7015,35 @@ struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); +} + +static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM); +} + +static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); +} + +static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); +} + +static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) +{ + return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); } static ulong emulator_read_gpr(struct 
x86_emulate_ctxt *ctxt, unsigned reg) @@ -5890,7 +7058,7 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); + kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -5900,12 +7068,26 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.hflags = emul_flags; + kvm_mmu_reset_context(vcpu); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, + const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); + return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); +} + +static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) +{ + kvm_smm_changed(emul_to_vcpu(ctxt)); +} + +static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) +{ + return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); } static const struct x86_emulate_ops emulate_ops = { @@ -5944,15 +7126,21 @@ .fix_hypercall = emulator_fix_hypercall, .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, + .guest_has_long_mode = emulator_guest_has_long_mode, + .guest_has_movbe = emulator_guest_has_movbe, + .guest_has_fxsr = emulator_guest_has_fxsr, + .guest_has_rdpid = emulator_guest_has_rdpid, .set_nmi_mask = emulator_set_nmi_mask, .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, + .post_leave_smm = emulator_post_leave_smm, + .set_xcr = emulator_set_xcr, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. 
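The new guest_has_long_mode/movbe/fxsr/rdpid callbacks extend the existing pattern: the emulator core stays vCPU-agnostic and asks the host about guest capabilities through the ops table instead of reaching into vCPU state. A rough sketch of that shape with invented names (emu_host_ops, emulate_movbe), not the real x86_emulate_ops API:

#include <stdbool.h>

struct emu_ctxt;            /* opaque to the emulator core */

struct emu_host_ops {
        bool (*guest_has_movbe)(struct emu_ctxt *ctxt);
        bool (*guest_has_fxsr)(struct emu_ctxt *ctxt);
};

/* Emulator-side user: refuse to emulate MOVBE unless the guest has it. */
static int emulate_movbe(struct emu_ctxt *ctxt, const struct emu_host_ops *ops)
{
        if (!ops->guest_has_movbe(ctxt))
                return -1;      /* raise #UD instead of emulating */
        /* ... perform the byte swap ... */
        return 0;
}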
So, if the last instruction, be it emulated or @@ -5963,7 +7151,7 @@ if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + kvm_x86_ops.set_interrupt_shadow(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -5971,9 +7159,9 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, @@ -5983,13 +7171,31 @@ return false; } +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@ -6003,13 +7209,18 @@ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->interruptibility = 0; + ctxt->have_exception = false; + ctxt->exception.vector = -1; + ctxt->perm_ok = false; + init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) +void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -6019,37 +7230,43 @@ ctxt->_eip = ctxt->eip + inc_eip; ret = emulate_int_real(ctxt, irq); - if (ret != X86EMUL_CONTINUE) - return EMULATE_FAIL; - - ctxt->eip = ctxt->_eip; - kvm_rip_write(vcpu, ctxt->eip); - kvm_set_rflags(vcpu, ctxt->eflags); - - return EMULATE_DONE; + if (ret != X86EMUL_CONTINUE) { + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); + } else { + ctxt->eip = ctxt->_eip; + kvm_rip_write(vcpu, ctxt->eip); + kvm_set_rflags(vcpu, ctxt->eflags); + } } EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) { - int r = EMULATE_DONE; - ++vcpu->stat.insn_emulation_fail; trace_kvm_emulate_insn_failed(vcpu); - if (emulation_type & EMULTYPE_NO_UD_ON_FAIL) - return EMULATE_FAIL; + if (emulation_type & EMULTYPE_VMWARE_GP) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (emulation_type & EMULTYPE_SKIP) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_USER_EXIT; + return 0; } kvm_queue_exception(vcpu, UD_VECTOR); - return r; + if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + 
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } + + return 1; } static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, @@ -6059,13 +7276,14 @@ gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; - if (!vcpu->arch.mmu.direct_map) { + if (!vcpu->arch.mmu->direct_map) { /* * Write permission should be allowed since only * write access need to be emulated. @@ -6098,7 +7316,7 @@ kvm_release_pfn_clean(pfn); /* The instructions are well-emulated on direct mmu. */ - if (vcpu->arch.mmu.direct_map) { + if (vcpu->arch.mmu->direct_map) { unsigned int indirect_shadow_pages; spin_lock(&vcpu->kvm->mmu_lock); @@ -6150,10 +7368,11 @@ */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6165,7 +7384,7 @@ vcpu->arch.last_retry_eip = ctxt->eip; vcpu->arch.last_retry_addr = cr2_or_gpa; - if (!vcpu->arch.mmu.direct_map) + if (!vcpu->arch.mmu->direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); @@ -6189,16 +7408,6 @@ kvm_mmu_reset_context(vcpu); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) -{ - unsigned changed = vcpu->arch.hflags ^ emul_flags; - - vcpu->arch.hflags = emul_flags; - - if (changed & HF_SMM_MASK) - kvm_smm_changed(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -6214,34 +7423,29 @@ return dr6; } -static void kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu, int *r) +static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) { struct kvm_run *kvm_run = vcpu->run; if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM; - kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; + kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; - } else { - /* - * "Certain debug exceptions may clear bit 0-3. The - * remaining contents of the DR6 register are never - * cleared by the processor". - */ - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + return 0; } + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); + return 1; } int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - int r = EMULATE_DONE; + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); + int r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_x86_ops.skip_emulated_instruction(vcpu); + if (unlikely(!r)) + return 0; /* * rflags is the old, "raw" value of the flags. The new value has @@ -6252,12 +7456,12 @@ * that sets the TF flag". 
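retry_instruction() above guards against livelock: a fault at the same RIP on the same address is retried at most once before falling back to full emulation. A stripped-down sketch of that guard follows; the real code additionally refuses retries for page-table-writing instructions, while in guest mode, and when EMULTYPE_ALLOW_RETRY_PF is not set.

#include <stdbool.h>
#include <stdint.h>

struct retry_state {
        uint64_t last_eip;
        uint64_t last_addr;
};

/*
 * Allow a retry only if this exact (RIP, faulting address) pair has not
 * been retried already; record it so the next identical fault emulates
 * instead of looping forever.
 */
static bool may_retry(struct retry_state *s, uint64_t eip, uint64_t addr)
{
        if (s->last_eip == eip && s->last_addr == addr)
                return false;

        s->last_eip = eip;
        s->last_addr = addr;
        return true;
}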
*/ if (unlikely(rflags & X86_EFLAGS_TF)) - kvm_vcpu_do_singlestep(vcpu, &r); - return r == EMULATE_DONE; + r = kvm_vcpu_do_singlestep(vcpu); + return r; } EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); -static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) +static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { @@ -6272,7 +7476,7 @@ kvm_run->debug.arch.pc = eip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; - *r = EMULATE_USER_EXIT; + *r = 0; return true; } } @@ -6285,10 +7489,8 @@ vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); - *r = EMULATE_DONE; + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); + *r = 1; return true; } } @@ -6327,13 +7529,45 @@ return false; } +/* + * Decode an instruction for emulation. The caller is responsible for handling + * code breakpoints. Note, manually detecting code breakpoints is unnecessary + * (and wrong) when emulating on an intercepted fault-like exception[*], as + * code breakpoints have higher priority and thus have already been done by + * hardware. + * + * [*] Except #MC, which is higher priority, but KVM should never emulate in + * response to a machine check. + */ +int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, + void *insn, int insn_len) +{ + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; + int r; + + init_emulate_ctxt(vcpu); + + ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; + + r = x86_decode_insn(ctxt, insn, insn_len); + + trace_kvm_emulate_insn_start(vcpu); + ++vcpu->stat.insn_emulation; + + return r; +} +EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); + int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; - bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + bool write_fault_to_spt; + + if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len))) + return 1; vcpu->arch.l1tf_flush_l1d = true; @@ -6341,40 +7575,36 @@ * Clear write_fault_to_shadow_pgtable here to ensure it is * never reused. */ + write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; vcpu->arch.write_fault_to_shadow_pgtable = false; - kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - init_emulate_ctxt(vcpu); + kvm_clear_exception_queue(vcpu); /* - * We will reenter on the same instruction since - * we do not set complete_userspace_io. This does not - * handle watchpoints yet, those would be handled in - * the emulate_ops. + * Return immediately if RIP hits a code breakpoint, such #DBs + * are fault-like and are higher priority than any faults on + * the code fetch itself. 
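With the EMULATE_* return codes gone, kvm_skip_emulated_instruction() and the single-step path use the plain 0/1 convention directly: 0 exits to userspace, 1 resumes the guest. Below is a compact sketch of the single-step decision after a skipped or emulated instruction with RFLAGS.TF set; the function and parameter names are illustrative, only the DR6.BS bit is architectural.

#include <stdbool.h>
#include <stdint.h>

#define DR6_BS (1u << 14)       /* architectural single-step bit in DR6 */

enum who { TO_USERSPACE = 0, TO_GUEST = 1 };

/*
 * A single-step #DB is owed: if host userspace asked for single-step
 * debugging it gets the event (KVM_EXIT_DEBUG in the real code),
 * otherwise the #DB is queued for the guest with DR6.BS as its payload.
 */
static int deliver_singlestep(bool userspace_singlestep,
                              uint32_t *dr6_payload)
{
        if (userspace_singlestep)
                return TO_USERSPACE;

        *dr6_payload = DR6_BS;
        return TO_GUEST;
}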
*/ if (!(emulation_type & EMULTYPE_SKIP) && - kvm_vcpu_check_breakpoint(vcpu, &r)) + kvm_vcpu_check_code_breakpoint(vcpu, &r)) return r; - ctxt->interruptibility = 0; - ctxt->have_exception = false; - ctxt->exception.vector = -1; - ctxt->perm_ok = false; - - ctxt->ud = emulation_type & EMULTYPE_TRAP_UD; - - r = x86_decode_insn(ctxt, insn, insn_len); - - trace_kvm_emulate_insn_start(vcpu); - ++vcpu->stat.insn_emulation; + r = x86_decode_emulated_instruction(vcpu, emulation_type, + insn, insn_len); if (r != EMULATION_OK) { - if (emulation_type & EMULTYPE_TRAP_UD) - return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, - emulation_type)) - return EMULATE_DONE; - if (ctxt->have_exception) { + if ((emulation_type & EMULTYPE_TRAP_UD) || + (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + if (reexecute_instruction(vcpu, cr2_or_gpa, + write_fault_to_spt, + emulation_type)) + return 1; + + if (ctxt->have_exception && + !(emulation_type & EMULTYPE_SKIP)) { /* * #UD should result in just EMULATION_FAILED, and trap-like * exception should not be encountered during decode. @@ -6382,27 +7612,32 @@ WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || exception_type(ctxt->exception.vector) == EXCPT_TRAP); inject_emulated_exception(vcpu); - return EMULATE_DONE; + return 1; } - if (emulation_type & EMULTYPE_SKIP) - return EMULATE_FAIL; return handle_emulation_failure(vcpu, emulation_type); } } - if ((emulation_type & EMULTYPE_VMWARE) && - !is_vmware_backdoor_opcode(ctxt)) - return EMULATE_FAIL; + if ((emulation_type & EMULTYPE_VMWARE_GP) && + !is_vmware_backdoor_opcode(ctxt)) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + /* + * Note, EMULTYPE_SKIP is intended for use *only* by vendor callbacks + * for kvm_skip_emulated_instruction(). The caller is responsible for + * updating interruptibility state and injecting single-step #DBs. + */ if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); if (ctxt->eflags & X86_EFLAGS_RF) kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); - return EMULATE_DONE; + return 1; } if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) - return EMULATE_DONE; + return 1; /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ @@ -6412,24 +7647,35 @@ } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. */ + if (vcpu->arch.mmu->direct_map) { + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. 
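The WARN above relies on classifying exception vectors as faults, traps, or aborts. A hedged sketch of that architectural classification is below; #DB is special in that it can be fault-like or trap-like depending on DR6, so it is grouped with the traps here and the caller is left to disambiguate, which matches how the surrounding code treats it.

enum excpt_class { EXCPT_FAULT, EXCPT_TRAP, EXCPT_ABORT };

static enum excpt_class exception_class(int vector)
{
        switch (vector) {
        case 1:  /* #DB - fault- or trap-like, caller checks DR6 */
        case 3:  /* #BP */
        case 4:  /* #OF */
                return EXCPT_TRAP;
        case 8:  /* #DF */
        case 18: /* #MC */
                return EXCPT_ABORT;
        default: /* everything else reports as a fault */
                return EXCPT_FAULT;
        }
}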
*/ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) - return EMULATE_DONE; + return 1; if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt, emulation_type)) - return EMULATE_DONE; + return 1; return handle_emulation_failure(vcpu, emulation_type); } if (ctxt->have_exception) { - r = EMULATE_DONE; + r = 1; if (inject_emulated_exception(vcpu)) return r; } else if (vcpu->arch.pio.count) { @@ -6440,26 +7686,36 @@ writeback = false; vcpu->arch.complete_userspace_io = complete_emulated_pio; } - r = EMULATE_USER_EXIT; + r = 0; } else if (vcpu->mmio_needed) { + ++vcpu->stat.mmio_exits; + if (!vcpu->mmio_is_write) writeback = false; - r = EMULATE_USER_EXIT; + r = 0; vcpu->arch.complete_userspace_io = complete_emulated_mmio; } else if (r == EMULATION_RESTART) goto restart; else - r = EMULATE_DONE; + r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + + /* + * Note, EXCPT_DB is assumed to be fault-like as the emulator + * only supports code breakpoints and general detect #DB, both + * of which are fault-like. + */ if (!ctxt->have_exception || exception_type(ctxt->exception.vector) == EXCPT_TRAP) { kvm_rip_write(vcpu, ctxt->eip); - if (r == EMULATE_DONE && ctxt->tf) - kvm_vcpu_do_singlestep(vcpu, &r); + if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) + r = kvm_vcpu_do_singlestep(vcpu); + if (kvm_x86_ops.update_emulated_instruction) + kvm_x86_ops.update_emulated_instruction(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -6509,9 +7765,9 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + unsigned long val = kvm_rax_read(vcpu); + int ret = emulator_pio_out(vcpu, size, port, &val, 1); + if (ret) return ret; @@ -6544,16 +7800,14 @@ } /* For size less than 4 we merge, else we zero extend */ - val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) - : 0; + val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform * the copy and tracing */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); + kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); } @@ -6565,12 +7819,11 @@ int ret; /* For size less than 4 we merge, else we zero extend */ - val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + val = (size < 4) ? 
kvm_rax_read(vcpu) : 0; - ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, - &val, 1); + ret = emulator_pio_in(vcpu, size, port, &val, 1); if (ret) { - kvm_register_write(vcpu, VCPU_REGS_RAX, val); + kvm_rax_write(vcpu, val); return ret; } @@ -6649,10 +7902,8 @@ } #endif -static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) +static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) { - struct cpufreq_freqs *freq = data; struct kvm *kvm; struct kvm_vcpu *vcpu; int i, send_ipi = 0; @@ -6696,17 +7947,12 @@ * */ - if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) - return 0; - if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) - return 0; - - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(cpu, tsc_khz_changed, freq, 1); mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu != freq->cpu) + if (vcpu->cpu != cpu) continue; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != raw_smp_processor_id()) @@ -6728,8 +7974,24 @@ * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); + smp_call_function_single(cpu, tsc_khz_changed, freq, 1); } +} + +static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + int cpu; + + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) + return 0; + if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) + return 0; + + for_each_cpu(cpu, freq->policy->cpus) + __kvmclock_cpufreq_notifier(freq, cpu); + return 0; } @@ -6749,20 +8011,21 @@ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { #ifdef CONFIG_CPU_FREQ - struct cpufreq_policy policy; + struct cpufreq_policy *policy; int cpu; - memset(&policy, 0, sizeof(policy)); cpu = get_cpu(); - cpufreq_get_policy(&policy, cpu); - if (policy.cpuinfo.max_freq) - max_tsc_khz = policy.cpuinfo.max_freq; + policy = cpufreq_cpu_get(cpu); + if (policy) { + if (policy->cpuinfo.max_freq) + max_tsc_khz = policy->cpuinfo.max_freq; + cpufreq_cpu_put(policy); + } put_cpu(); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); @@ -6781,7 +8044,7 @@ int user_mode = 3; if (__this_cpu_read(current_vcpu)) - user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu)); + user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu)); return user_mode != 0; } @@ -6796,10 +8059,20 @@ return ip; } +static void kvm_handle_intel_pt_intr(void) +{ + struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu); + + kvm_make_request(KVM_REQ_PMI, vcpu); + __set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT, + (unsigned long *)&vcpu->arch.pmu.global_status); +} + static struct perf_guest_info_callbacks kvm_guest_cbs = { .is_in_guest = kvm_is_in_guest, .is_user_mode = kvm_is_user_mode, .get_guest_ip = kvm_get_guest_ip, + .handle_intel_pt_intr = NULL, }; #ifdef CONFIG_X86_64 @@ -6821,6 +8094,18 @@ static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); /* + * Indirection to move queue_work() out of the tk_core.seq write held + * region to prevent possible deadlocks against time accessors which + * are invoked with work related locks held. 
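The "merge for size < 4, zero-extend otherwise" comment reflects x86 register-write rules for IN: 8- and 16-bit results leave the upper bytes of RAX intact, while a 32-bit result zero-extends into the full register. A small sketch of folding a completed PIO result back into RAX; it assumes a little-endian host, as on x86, and a size of 1, 2 or 4 bytes.

#include <stdint.h>
#include <string.h>

static uint64_t fold_pio_result(uint64_t old_rax, const void *data,
                                unsigned int size)
{
        /* Seed with the old RAX only when the upper bytes must survive. */
        uint64_t val = (size < 4) ? old_rax : 0;

        memcpy(&val, data, size);       /* overlay the low 'size' bytes */
        return val;
}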
+ */ +static void pvclock_irq_work_fn(struct irq_work *w) +{ + queue_work(system_long_wq, &pvclock_gtod_work); +} + +static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn); + +/* * Notification about pvclock gtod data update. */ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, @@ -6831,13 +8116,14 @@ update_pvclock_gtod(tk); - /* disable master clock if host does not trust, or does not - * use, TSC based clocksource. + /* + * Disable master clock if host does not trust, or does not use, + * TSC based clocksource. Delegate queue_work() to irq_work as + * this is invoked with tk_core.seq write held. */ if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) - queue_work(system_long_wq, &pvclock_gtod_work); - + irq_work_queue(&pvclock_irq_work); return 0; } @@ -6848,57 +8134,79 @@ int kvm_arch_init(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - struct kvm_x86_ops *ops = opaque; - if (kvm_x86_ops) { + if (kvm_x86_ops.hardware_enable) { printk(KERN_ERR "kvm: already loaded the other module\n"); r = -EEXIST; goto out; } if (!ops->cpu_has_kvm_support()) { - printk(KERN_ERR "kvm: no hardware support\n"); + pr_err_ratelimited("kvm: no hardware support\n"); r = -EOPNOTSUPP; goto out; } if (ops->disabled_by_bios()) { - printk(KERN_ERR "kvm: disabled by bios\n"); + pr_err_ratelimited("kvm: disabled by bios\n"); + r = -EOPNOTSUPP; + goto out; + } + + /* + * KVM explicitly assumes that the guest has an FPU and + * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the + * vCPU's FPU state as a fxregs_state struct. + */ + if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { + printk(KERN_ERR "kvm: inadequate fpu\n"); r = -EOPNOTSUPP; goto out; } r = -ENOMEM; - shared_msrs = alloc_percpu(struct kvm_shared_msrs); - if (!shared_msrs) { - printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), + __alignof__(struct fpu), SLAB_ACCOUNT, + NULL); + if (!x86_fpu_cache) { + printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n"); goto out; } -#ifdef CONFIG_PREEMPT_RT_FULL - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); - return -EOPNOTSUPP; + x86_emulator_cache = kvm_alloc_emulator_cache(); + if (!x86_emulator_cache) { + pr_err("kvm: failed to allocate cache for x86 emulator\n"); + goto out_free_x86_fpu_cache; } -#endif - r = kvm_mmu_module_init(); + user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); + if (!user_return_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n"); + goto out_free_x86_emulator_cache; + } + + r = kvm_mmu_vendor_module_init(); if (r) goto out_free_percpu; - - kvm_x86_ops = ops; kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, PT_PRESENT_MASK, 0, sme_me_mask); kvm_timer_init(); + if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest()) + kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr; perf_register_guest_info_callbacks(&kvm_guest_cbs); - if (boot_cpu_has(X86_FEATURE_XSAVE)) + if (boot_cpu_has(X86_FEATURE_XSAVE)) { host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; + } kvm_lapic_init(); + if (pi_inject_timer == -1) + pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); @@ -6909,7 +8217,11 @@ 
return 0; out_free_percpu: - free_percpu(shared_msrs); + free_percpu(user_return_msrs); +out_free_x86_emulator_cache: + kmem_cache_destroy(x86_emulator_cache); +out_free_x86_fpu_cache: + kmem_cache_destroy(x86_fpu_cache); out: return r; } @@ -6922,6 +8234,7 @@ #endif kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + kvm_guest_cbs.handle_intel_pt_intr = NULL; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, @@ -6929,11 +8242,14 @@ cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE); #ifdef CONFIG_X86_64 pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); + irq_work_sync(&pvclock_irq_work); cancel_work_sync(&pvclock_gtod_work); #endif - kvm_x86_ops = NULL; - kvm_mmu_module_exit(); - free_percpu(shared_msrs); + kvm_x86_ops.hardware_enable = NULL; + kvm_mmu_vendor_module_exit(); + free_percpu(user_return_msrs); + kmem_cache_destroy(x86_emulator_cache); + kmem_cache_destroy(x86_fpu_cache); } int kvm_vcpu_halt(struct kvm_vcpu *vcpu) @@ -6997,22 +8313,52 @@ */ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) { - struct kvm_lapic_irq lapic_irq; + /* + * All other fields are unused for APIC_DM_REMRD, but may be consumed by + * common code, e.g. for tracing. Defer initialization to the compiler. + */ + struct kvm_lapic_irq lapic_irq = { + .delivery_mode = APIC_DM_REMRD, + .dest_mode = APIC_DEST_PHYSICAL, + .shorthand = APIC_DEST_NOSHORT, + .dest_id = apicid, + }; - lapic_irq.shorthand = 0; - lapic_irq.dest_mode = 0; - lapic_irq.level = 0; - lapic_irq.dest_id = apicid; - lapic_irq.msi_redir_hint = false; - - lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); } -void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) +bool kvm_apicv_activated(struct kvm *kvm) { - vcpu->arch.apicv_active = false; - kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu); + return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); +} +EXPORT_SYMBOL_GPL(kvm_apicv_activated); + +void kvm_apicv_init(struct kvm *kvm, bool enable) +{ + if (enable) + clear_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); + else + set_bit(APICV_INHIBIT_REASON_DISABLE, + &kvm->arch.apicv_inhibit_reasons); +} +EXPORT_SYMBOL_GPL(kvm_apicv_init); + +static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id) +{ + struct kvm_vcpu *target = NULL; + struct kvm_apic_map *map; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) + target = map->phys_map[dest_id]->vcpu; + + rcu_read_unlock(); + + if (target && READ_ONCE(target->ready)) + kvm_vcpu_yield_to(target); } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) @@ -7023,11 +8369,11 @@ if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); - nr = kvm_register_read(vcpu, VCPU_REGS_RAX); - a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); - a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); - a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); - a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); trace_kvm_hypercall(nr, a0, a1, a2, a3); @@ -7040,17 +8386,23 @@ a3 &= 0xFFFFFFFF; } - if (kvm_x86_ops->get_cpl(vcpu) != 0) { + if (kvm_x86_ops.get_cpl(vcpu) != 0) { ret = -KVM_EPERM; goto out; } + + ret = -KVM_ENOSYS; switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; case KVM_HC_KICK_CPU: + if 
(!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); + kvm_sched_yield(vcpu->kvm, a1); ret = 0; break; #ifdef CONFIG_X86_64 @@ -7059,7 +8411,17 @@ break; #endif case KVM_HC_SEND_IPI: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; + case KVM_HC_SCHED_YIELD: + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + + kvm_sched_yield(vcpu->kvm, a0); + ret = 0; break; default: ret = -KVM_ENOSYS; @@ -7068,7 +8430,7 @@ out: if (!op_64_bit) ret = (u32)ret; - kvm_register_write(vcpu, VCPU_REGS_RAX, ret); + kvm_rax_write(vcpu, ret); ++vcpu->stat.hypercalls; return kvm_skip_emulated_instruction(vcpu); @@ -7081,7 +8443,7 @@ char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); - kvm_x86_ops->patch_hypercall(vcpu, instruction); + kvm_x86_ops.patch_hypercall(vcpu, instruction); return emulator_write_emulated(ctxt, rip, instruction, 3, &ctxt->exception); @@ -7110,7 +8472,7 @@ { int max_irr, tpr; - if (!kvm_x86_ops->update_cr8_intercept) + if (!kvm_x86_ops.update_cr8_intercept) return; if (!lapic_in_kernel(vcpu)) @@ -7129,24 +8491,32 @@ tpr = kvm_lapic_get_cr8(vcpu); - kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); + kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } static void kvm_inject_exception(struct kvm_vcpu *vcpu) { - if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) - vcpu->arch.exception.error_code = false; - kvm_x86_ops->queue_exception(vcpu); + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code, + vcpu->arch.exception.injected); + + if (vcpu->arch.exception.error_code && !is_protmode(vcpu)) + vcpu->arch.exception.error_code = false; + kvm_x86_ops.queue_exception(vcpu); } -static int inject_pending_event(struct kvm_vcpu *vcpu) +static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; + bool can_inject = true; /* try to reinject previous events if any */ - if (vcpu->arch.exception.injected) + if (vcpu->arch.exception.injected) { kvm_inject_exception(vcpu); + can_inject = false; + } /* * Do not inject an NMI or interrupt if there is a pending * exception. Exceptions and interrupts are recognized at @@ -7162,11 +8532,17 @@ * fully complete the previous instruction. */ else if (!vcpu->arch.exception.pending) { - if (vcpu->arch.nmi_injected) - kvm_x86_ops->set_nmi(vcpu); - else if (vcpu->arch.interrupt.injected) - kvm_x86_ops->set_irq(vcpu); + if (vcpu->arch.nmi_injected) { + kvm_x86_ops.set_nmi(vcpu); + can_inject = false; + } else if (vcpu->arch.interrupt.injected) { + kvm_x86_ops.set_irq(vcpu); + can_inject = false; + } } + + WARN_ON_ONCE(vcpu->arch.exception.injected && + vcpu->arch.exception.pending); /* * Call check_nested_events() even if we reinjected a previous event @@ -7174,69 +8550,107 @@ * from L2 to L1 due to pending L1 events which require exit * from L2 to L1. 
*/ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu); - if (r != 0) - return r; + if (is_guest_mode(vcpu)) { + r = kvm_x86_ops.nested_ops->check_events(vcpu); + if (r < 0) + goto busy; } /* try to inject new event if pending */ if (vcpu->arch.exception.pending) { - trace_kvm_inj_exception(vcpu->arch.exception.nr, - vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); - - WARN_ON_ONCE(vcpu->arch.exception.injected); - vcpu->arch.exception.pending = false; - vcpu->arch.exception.injected = true; - + /* + * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS + * value pushed on the stack. Trap-like exception and all #DBs + * leave RF as-is (KVM follows Intel's behavior in this regard; + * AMD states that code breakpoint #DBs excplitly clear RF=0). + * + * Note, most versions of Intel's SDM and AMD's APM incorrectly + * describe the behavior of General Detect #DBs, which are + * fault-like. They do _not_ set RF, a la code breakpoints. + */ if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | X86_EFLAGS_RF); - if (vcpu->arch.exception.nr == DB_VECTOR && - (vcpu->arch.dr7 & DR7_GD)) { - vcpu->arch.dr7 &= ~DR7_GD; - kvm_update_dr7(vcpu); + if (vcpu->arch.exception.nr == DB_VECTOR) { + kvm_deliver_exception_payload(vcpu); + if (vcpu->arch.dr7 & DR7_GD) { + vcpu->arch.dr7 &= ~DR7_GD; + kvm_update_dr7(vcpu); + } } kvm_inject_exception(vcpu); + + vcpu->arch.exception.pending = false; + vcpu->arch.exception.injected = true; + + can_inject = false; } - /* Don't consider new event if we re-injected an event */ - if (kvm_event_needs_reinjection(vcpu)) - return 0; - - if (vcpu->arch.smi_pending && !is_smm(vcpu) && - kvm_x86_ops->smi_allowed(vcpu)) { - vcpu->arch.smi_pending = false; - ++vcpu->arch.smi_count; - enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops->set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu)) { - /* - * Because interrupts can be injected asynchronously, we are - * calling check_nested_events again here to avoid a race condition. - * See https://lkml.org/lkml/2014/7/2/60 for discussion about this - * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? - */ - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) { - r = kvm_x86_ops->check_nested_events(vcpu); - if (r != 0) - return r; - } - if (kvm_x86_ops->interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); - kvm_x86_ops->set_irq(vcpu); - } + /* + * Finally, inject interrupt events. If an event cannot be injected + * due to architectural conditions (e.g. IF=0) a window-open exit + * will re-request KVM_REQ_EVENT. Sometimes however an event is pending + * and can architecturally be injected, but we cannot do it right now: + * an interrupt could have arrived just now and we have to inject it + * as a vmexit, or there could already an event in the queue, which is + * indicated by can_inject. In that case we request an immediate exit + * in order to make progress and get back here for another iteration. + * The kvm_x86_ops hooks communicate this by returning -EBUSY. + */ + if (vcpu->arch.smi_pending) { + r = can_inject ? 
kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; + enter_smm(vcpu); + can_inject = false; + } else + kvm_x86_ops.enable_smi_window(vcpu); } - return 0; + if (vcpu->arch.nmi_pending) { + r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops.set_nmi(vcpu); + can_inject = false; + WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); + } + if (vcpu->arch.nmi_pending) + kvm_x86_ops.enable_nmi_window(vcpu); + } + + if (kvm_cpu_has_injectable_intr(vcpu)) { + r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); + kvm_x86_ops.set_irq(vcpu); + WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); + } + if (kvm_cpu_has_injectable_intr(vcpu)) + kvm_x86_ops.enable_irq_window(vcpu); + } + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + *req_immediate_exit = true; + + WARN_ON(vcpu->arch.exception.pending); + return; + +busy: + *req_immediate_exit = true; + return; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -7248,7 +8662,7 @@ * If an NMI is already in progress, limit further NMIs to just one. * Otherwise, allow two (and we'll inject the first one immediately). */ - if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) limit = 1; vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); @@ -7338,11 +8752,11 @@ put_smstate(u32, buf, 0x7f7c, seg.limit); put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg)); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7f74, dt.address); put_smstate(u32, buf, 0x7f70, dt.size); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7f58, dt.address); put_smstate(u32, buf, 0x7f54, dt.size); @@ -7392,7 +8806,7 @@ put_smstate(u32, buf, 0x7e94, seg.limit); put_smstate(u64, buf, 0x7e98, seg.base); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); put_smstate(u32, buf, 0x7e84, dt.size); put_smstate(u64, buf, 0x7e88, dt.address); @@ -7402,7 +8816,7 @@ put_smstate(u32, buf, 0x7e74, seg.limit); put_smstate(u64, buf, 0x7e78, seg.base); - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); put_smstate(u32, buf, 0x7e64, dt.size); put_smstate(u64, buf, 0x7e68, dt.address); @@ -7432,28 +8846,28 @@ * vCPU state (e.g. leave guest mode) after we've saved the state into * the SMM state-save area. */ - kvm_x86_ops->pre_enter_smm(vcpu, buf); + kvm_x86_ops.pre_enter_smm(vcpu, buf); vcpu->arch.hflags |= HF_SMM_MASK; kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); - if (kvm_x86_ops->get_nmi_mask(vcpu)) + if (kvm_x86_ops.get_nmi_mask(vcpu)) vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else - kvm_x86_ops->set_nmi_mask(vcpu, true); + kvm_x86_ops.set_nmi_mask(vcpu, true); kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0x8000); cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; - kvm_x86_ops->set_cr4(vcpu, 0); + kvm_x86_ops.set_cr4(vcpu, 0); /* Undocumented: IDT limit is set to zero on entry to SMM. 
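The reworked inject_pending_event() threads a can_inject flag through the event classes and turns the vendor *_allowed() hooks into tri-state answers, with -EBUSY forcing an immediate exit. The sketch below abstracts that flow with invented types (vevents, allow); it is a shape illustration rather than the real API, and it collapses the NMI count to a single flag.

#include <stdbool.h>

enum allow { BLOCKED = 0, ALLOWED = 1, BUSY = -1 };

struct vevents {
        bool smi_pending, nmi_pending, irq_pending;
        bool smi_window, nmi_window, irq_window;   /* "open an exit window" */
};

/*
 * One pass of the injection flow: classes are tried in priority order and
 * at most one new event goes in per guest entry.  A pending but blocked
 * class asks for a window-open exit; a BUSY answer means the vendor code
 * cannot even tell yet, which maps to requesting an immediate exit
 * (returned as true here).
 */
static bool inject_pass(struct vevents *e, enum allow (*allowed)(int what),
                        bool can_inject)
{
        if (e->smi_pending) {
                enum allow a = can_inject ? allowed(0) : BUSY;

                if (a == BUSY)
                        return true;
                if (a == ALLOWED) {
                        e->smi_pending = false; /* enter_smm() in the real code */
                        can_inject = false;
                } else {
                        e->smi_window = true;
                }
        }

        if (e->nmi_pending) {
                enum allow a = can_inject ? allowed(1) : BUSY;

                if (a == BUSY)
                        return true;
                if (a == ALLOWED) {
                        e->nmi_pending = false; /* set_nmi() in the real code */
                        can_inject = false;
                }
                if (e->nmi_pending)
                        e->nmi_window = true;
        }

        if (e->irq_pending) {
                enum allow a = can_inject ? allowed(2) : BUSY;

                if (a == BUSY)
                        return true;
                if (a == ALLOWED)
                        e->irq_pending = false; /* set_irq() in the real code */
                if (e->irq_pending)
                        e->irq_window = true;
        }

        return false;
}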
*/ dt.address = dt.size = 0; - kvm_x86_ops->set_idt(vcpu, &dt); + kvm_x86_ops.set_idt(vcpu, &dt); __kvm_set_dr(vcpu, 7, DR7_FIXED_1); @@ -7484,10 +8898,10 @@ #ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) - kvm_x86_ops->set_efer(vcpu, 0); + kvm_x86_ops.set_efer(vcpu, 0); #endif - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } @@ -7497,10 +8911,82 @@ kvm_make_request(KVM_REQ_EVENT, vcpu); } +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap) +{ + cpumask_var_t cpus; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, + NULL, vcpu_bitmap, cpus); + + free_cpumask_var(cpus); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); } + +void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) +{ + if (!lapic_in_kernel(vcpu)) + return; + + vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + kvm_apic_update_apicv(vcpu); + kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); + +/* + * NOTE: Do not hold any lock prior to calling this. + * + * In particular, kvm_request_apicv_update() expects kvm->srcu not to be + * locked, because it calls __x86_set_memory_region() which does + * synchronize_srcu(&kvm->srcu). + */ +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + struct kvm_vcpu *except; + unsigned long old, new, expected; + + if (!kvm_x86_ops.check_apicv_inhibit_reasons || + !kvm_x86_ops.check_apicv_inhibit_reasons(bit)) + return; + + old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); + do { + expected = new = old; + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); + if (new == old) + break; + old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); + } while (old != expected); + + if (!!old == !!new) + return; + + trace_kvm_apicv_update_request(activate, bit); + if (kvm_x86_ops.pre_update_apicv_exec_ctrl) + kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate); + + /* + * Sending request to update APICV for all other vcpus, + * while update the calling vcpu immediately instead of + * waiting for another #VMEXIT to handle the request. 
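kvm_request_apicv_update() updates the inhibit-reason mask lock-free: recompute the desired value from a snapshot, publish it with cmpxchg, retry on contention, and only fan the request out to vCPUs when the mask actually crosses zero. A self-contained sketch of the same loop using C11 atomics in place of the kernel's cmpxchg():

#include <stdatomic.h>
#include <stdbool.h>

/* Returns true if the overall activation state (mask zero/nonzero) flipped. */
static bool update_inhibit(_Atomic unsigned long *reasons,
                           unsigned long bit, bool activate)
{
        unsigned long old = atomic_load(reasons), new;

        do {
                new = activate ? (old & ~(1UL << bit)) : (old | (1UL << bit));
                if (new == old)
                        return false;   /* nothing changed, nothing to request */
        } while (!atomic_compare_exchange_weak(reasons, &old, new));

        /* Only a 0 <-> nonzero transition changes whether APICv is active. */
        return (old == 0) != (new == 0);
}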
+ */ + except = kvm_get_running_vcpu(); + kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, + except); + if (except) + kvm_vcpu_update_apicv(except); +} +EXPORT_SYMBOL_GPL(kvm_request_apicv_update); static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { @@ -7513,7 +8999,7 @@ kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); if (ioapic_in_kernel(vcpu->kvm)) kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -7533,7 +9019,7 @@ bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, vcpu_to_synic(vcpu)->vec_bitmap, 256); - kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -7550,28 +9036,22 @@ kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); } +void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) +{ + if (kvm_x86_ops.guest_memory_reclaimed) + kvm_x86_ops.guest_memory_reclaimed(kvm); +} + void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; - if (!lapic_in_kernel(vcpu)) return; - if (!kvm_x86_ops->set_apic_access_page_addr) + if (!kvm_x86_ops.set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + kvm_x86_ops.set_apic_access_page_addr(vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) { @@ -7590,12 +9070,17 @@ bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); + fastpath_t exit_fastpath; bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -7611,10 +9096,19 @@ } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); - if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu)) - kvm_mmu_load_cr3(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) + kvm_mmu_load_pgd(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... 
*/ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; @@ -7685,6 +9179,12 @@ */ if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) kvm_hv_process_stimers(vcpu); + if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) + kvm_vcpu_update_apicv(vcpu); + if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) + kvm_check_async_pf_completion(vcpu); + if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) + kvm_x86_ops.msr_filter_changed(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -7695,32 +9195,9 @@ goto out; } - if (inject_pending_event(vcpu) != 0) - req_immediate_exit = true; - else { - /* Enable SMI/NMI/IRQ window open exits if needed. - * - * SMIs have three cases: - * 1) They can be nested, and then there is nothing to - * do here because RSM will cause a vmexit anyway. - * 2) There is an ISA-specific reason why SMI cannot be - * injected, and the moment when this changes can be - * intercepted. - * 3) Or the SMI can be pending because - * inject_pending_event has completed the injection - * of an IRQ or NMI from the previous vmexit, and - * then we request an immediate exit to inject the - * SMI. - */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops->enable_smi_window(vcpu)) - req_immediate_exit = true; - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - WARN_ON(vcpu->arch.exception.pending); - } + inject_pending_event(vcpu, &req_immediate_exit); + if (req_int_win) + kvm_x86_ops.enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); @@ -7735,7 +9212,7 @@ preempt_disable(); - kvm_x86_ops->prepare_guest_switch(vcpu); + kvm_x86_ops.prepare_guest_switch(vcpu); /* * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt @@ -7751,7 +9228,7 @@ * 1) We should set ->mode before checking ->requests. Please see * the comment in kvm_vcpu_exiting_guest_mode(). * - * 2) For APICv, we should set ->mode before checking PIR.ON. This + * 2) For APICv, we should set ->mode before checking PID.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on * (see vmx_deliver_posted_interrupt). * @@ -7766,10 +9243,9 @@ * notified with kvm_vcpu_kick. 
*/ if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); - if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) - || need_resched() || signal_pending(current)) { + if (kvm_vcpu_exit_request(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -7781,13 +9257,14 @@ if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_x86_ops->request_immediate_exit(vcpu); + kvm_x86_ops.request_immediate_exit(vcpu); } - trace_kvm_entry(vcpu->vcpu_id); - if (lapic_timer_advance_ns) - wait_lapic_expire(vcpu); - guest_enter_irqoff(); + trace_kvm_entry(vcpu); + + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -7801,7 +9278,7 @@ set_debugreg(0, 7); } - kvm_x86_ops->run(vcpu); + exit_fastpath = kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -7811,9 +9288,8 @@ */ if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); - kvm_x86_ops->sync_dirty_debug_regs(vcpu); + kvm_x86_ops.sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); - kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } @@ -7828,18 +9304,43 @@ if (hw_breakpoint_active()) hw_breakpoint_restore(); + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); + kvm_x86_ops.handle_exit_irqoff(vcpu); + + /* + * Consume any pending interrupts, including the possible source of + * VM-Exit on SVM and any ticks that occur between VM-Exit and now. + * An instruction is required after local_irq_enable() to fully unblock + * interrupts on processors that implement an interrupt shadow, the + * stat.exits increment will do nicely. + */ kvm_before_interrupt(vcpu); - kvm_x86_ops->handle_external_intr(vcpu); + local_irq_enable(); + ++vcpu->stat.exits; + local_irq_disable(); kvm_after_interrupt(vcpu); - ++vcpu->stat.exits; + /* + * Wait until after servicing IRQs to account guest time so that any + * ticks that occurred while running the guest are properly accounted + * to the guest. Waiting until IRQs are enabled degrades the accuracy + * of accounting via context tracking, but the loss of accuracy is + * acceptable for all known use cases. 
+ */ + vtime_account_guest_exit(); - guest_exit_irqoff(); + if (lapic_in_kernel(vcpu)) { + s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; + if (delta != S64_MIN) { + trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta); + vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN; + } + } local_irq_enable(); preempt_enable(); @@ -7860,12 +9361,13 @@ if (vcpu->arch.apic_attention) kvm_lapic_sync_from_vapic(vcpu); - vcpu->arch.gpa_available = false; - r = kvm_x86_ops->handle_exit(vcpu); + r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath); return r; cancel_injection: - kvm_x86_ops->cancel_injection(vcpu); + if (req_immediate_exit) + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_x86_ops.cancel_injection(vcpu); if (unlikely(vcpu->arch.apic_attention)) kvm_lapic_sync_from_vapic(vcpu); out: @@ -7875,13 +9377,13 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) { if (!kvm_arch_vcpu_runnable(vcpu) && - (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) { + (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); - if (kvm_x86_ops->post_block) - kvm_x86_ops->post_block(vcpu); + if (kvm_x86_ops.post_block) + kvm_x86_ops.post_block(vcpu); if (!kvm_check_request(KVM_REQ_UNHALT, vcpu)) return 1; @@ -7893,6 +9395,7 @@ vcpu->arch.pv.pv_unhalted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + fallthrough; case KVM_MP_STATE_RUNNABLE: vcpu->arch.apf.halted = false; break; @@ -7900,15 +9403,14 @@ break; default: return -EINTR; - break; } return 1; } static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu); + if (is_guest_mode(vcpu)) + kvm_x86_ops.nested_ops->check_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); @@ -7923,6 +9425,13 @@ vcpu->arch.l1tf_flush_l1d = true; for (;;) { + /* + * If another guest vCPU requests a PV TLB flush in the middle + * of instruction emulation, the rest of the emulation could + * use a stale page translation. Assume that any code after + * this point can start executing an instruction. + */ + vcpu->arch.at_instruction_boundary = false; if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { @@ -7944,17 +9453,11 @@ break; } - kvm_check_async_pf_completion(vcpu); - - if (signal_pending(current)) { - r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - break; - } - if (need_resched()) { + if (__xfer_to_guest_mode_work_pending()) { srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - cond_resched(); + r = xfer_to_guest_mode_handle_work(vcpu); + if (r) + return r; vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } @@ -7967,12 +9470,11 @@ static inline int complete_emulated_io(struct kvm_vcpu *vcpu) { int r; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) - return 0; - return 1; + return r; } static int complete_emulated_pio(struct kvm_vcpu *vcpu) @@ -8045,31 +9547,55 @@ return 0; } +static void kvm_save_current_fpu(struct fpu *fpu) +{ + /* + * If the target FPU state is not resident in the CPU registers, just + * memcpy() from current, else save CPU state directly to the target. 
+ */ + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&fpu->state, &current->thread.fpu.state, + fpu_kernel_xstate_size); + else + copy_fpregs_to_fpstate(fpu); +} + /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.user_fpu); - /* PKRU is separately restored in kvm_x86_ops->run. */ - __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state, + fpregs_lock(); + + kvm_save_current_fpu(vcpu->arch.user_fpu); + + /* PKRU is separately restored in kvm_x86_ops.run. */ + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. */ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); - copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); - copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); - preempt_enable(); + fpregs_lock(); + + kvm_save_current_fpu(vcpu->arch.guest_fpu); + + copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state); + + fpregs_mark_activate(); + fpregs_unlock(); + ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } -int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; int r; vcpu_load(vcpu); @@ -8087,18 +9613,18 @@ r = -EAGAIN; if (signal_pending(current)) { r = -EINTR; - vcpu->run->exit_reason = KVM_EXIT_INTR; + kvm_run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } goto out; } - if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { r = -EINVAL; goto out; } - if (vcpu->run->kvm_dirty_regs) { + if (kvm_run->kvm_dirty_regs) { r = sync_regs(vcpu); if (r != 0) goto out; @@ -8128,7 +9654,7 @@ out: kvm_put_guest_fpu(vcpu); - if (vcpu->run->kvm_valid_regs) + if (kvm_run->kvm_valid_regs) store_regs(vcpu); post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); @@ -8147,26 +9673,26 @@ * that usually, but some bad designed PV devices (vmware * backdoor interface) need this to work */ - emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt); + emulator_writeback_register_cache(vcpu->arch.emulate_ctxt); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; } - regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); - regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); - regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); - regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); - regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); - regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); + regs->rax = kvm_rax_read(vcpu); + regs->rbx = kvm_rbx_read(vcpu); + regs->rcx = kvm_rcx_read(vcpu); + regs->rdx = kvm_rdx_read(vcpu); + regs->rsi = kvm_rsi_read(vcpu); + regs->rdi = kvm_rdi_read(vcpu); + regs->rsp = kvm_rsp_read(vcpu); + regs->rbp = kvm_rbp_read(vcpu); #ifdef CONFIG_X86_64 - regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); - regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); - regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); - regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); - regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); - regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); - regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); - regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); + regs->r8 = kvm_r8_read(vcpu); 
+ regs->r9 = kvm_r9_read(vcpu); + regs->r10 = kvm_r10_read(vcpu); + regs->r11 = kvm_r11_read(vcpu); + regs->r12 = kvm_r12_read(vcpu); + regs->r13 = kvm_r13_read(vcpu); + regs->r14 = kvm_r14_read(vcpu); + regs->r15 = kvm_r15_read(vcpu); #endif regs->rip = kvm_rip_read(vcpu); @@ -8186,23 +9712,23 @@ vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); - kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); - kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); - kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); - kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); - kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); - kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); - kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); + kvm_rax_write(vcpu, regs->rax); + kvm_rbx_write(vcpu, regs->rbx); + kvm_rcx_write(vcpu, regs->rcx); + kvm_rdx_write(vcpu, regs->rdx); + kvm_rsi_write(vcpu, regs->rsi); + kvm_rdi_write(vcpu, regs->rdi); + kvm_rsp_write(vcpu, regs->rsp); + kvm_rbp_write(vcpu, regs->rbp); #ifdef CONFIG_X86_64 - kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); - kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); - kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); - kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); - kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); - kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); - kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); - kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + kvm_r8_write(vcpu, regs->r8); + kvm_r9_write(vcpu, regs->r9); + kvm_r10_write(vcpu, regs->r10); + kvm_r11_write(vcpu, regs->r11); + kvm_r12_write(vcpu, regs->r12); + kvm_r13_write(vcpu, regs->r13); + kvm_r14_write(vcpu, regs->r14); + kvm_r15_write(vcpu, regs->r15); #endif kvm_rip_write(vcpu, regs->rip); @@ -8245,10 +9771,10 @@ kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); - kvm_x86_ops->get_idt(vcpu, &dt); + kvm_x86_ops.get_idt(vcpu, &dt); sregs->idt.limit = dt.size; sregs->idt.base = dt.address; - kvm_x86_ops->get_gdt(vcpu, &dt); + kvm_x86_ops.get_gdt(vcpu, &dt); sregs->gdt.limit = dt.size; sregs->gdt.base = dt.address; @@ -8260,7 +9786,7 @@ sregs->efer = vcpu->arch.efer; sregs->apic_base = kvm_get_apic_base(vcpu); - memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); + memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap)); if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) set_bit(vcpu->arch.interrupt.nr, @@ -8307,8 +9833,12 @@ mp_state->mp_state != KVM_MP_STATE_RUNNABLE) goto out; - /* INITs are latched while in SMM */ - if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + /* + * KVM_MP_STATE_INIT_RECEIVED means the processor is in + * INIT state; latched init should be reported using + * KVM_SET_VCPU_EVENTS, so reject it here. 
+ */ + if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -8329,21 +9859,23 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); - - if (ret) - return EMULATE_FAIL; + if (ret) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; + } kvm_rip_write(vcpu, ctxt->eip); kvm_set_rflags(vcpu, ctxt->eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - return EMULATE_DONE; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -8357,6 +9889,8 @@ */ if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) + return -EINVAL; + if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits) return -EINVAL; } else { /* @@ -8389,31 +9923,31 @@ dt.size = sregs->idt.limit; dt.address = sregs->idt.base; - kvm_x86_ops->set_idt(vcpu, &dt); + kvm_x86_ops.set_idt(vcpu, &dt); dt.size = sregs->gdt.limit; dt.address = sregs->gdt.base; - kvm_x86_ops->set_gdt(vcpu, &dt); + kvm_x86_ops.set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.efer != sregs->efer; - kvm_x86_ops->set_efer(vcpu, sregs->efer); + kvm_x86_ops.set_efer(vcpu, sregs->efer); mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; - kvm_x86_ops->set_cr0(vcpu, sregs->cr0); + kvm_x86_ops.set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)); - kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + kvm_x86_ops.set_cr4(vcpu, sregs->cr4); if (cpuid_update_needed) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -8517,7 +10051,7 @@ */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_bp_intercept(vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); r = 0; @@ -8556,7 +10090,7 @@ vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu.state.fxsave; + fxsave = &vcpu->arch.guest_fpu->state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -8564,7 +10098,7 @@ fpu->last_opcode = fxsave->fop; fpu->last_ip = fxsave->rip; fpu->last_dp = fxsave->rdp; - memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space)); vcpu_put(vcpu); return 0; @@ -8576,7 +10110,7 @@ vcpu_load(vcpu); - fxsave = &vcpu->arch.guest_fpu.state.fxsave; + fxsave = &vcpu->arch.guest_fpu->state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -8585,7 +10119,7 @@ fxsave->fop = fpu->last_opcode; fxsave->rip = fpu->last_ip; fxsave->rdp = fpu->last_dp; - memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space)); vcpu_put(vcpu); return 0; @@ -8632,9 +10166,9 @@ static void fx_init(struct kvm_vcpu 
*vcpu) { - fpstate_init(&vcpu->arch.guest_fpu.state); + fpstate_init(&vcpu->arch.guest_fpu->state); if (boot_cpu_has(X86_FEATURE_XSAVES)) - vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = + vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -8645,48 +10179,122 @@ vcpu->arch.cr0 |= X86_CR0_ET; } -void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) { - void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask; - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; - - kvm_release_pfn(cache->pfn, cache->dirty, cache); - - kvmclock_reset(vcpu); - - kvm_x86_ops->vcpu_free(vcpu); - free_cpumask_var(wbinvd_dirty_mask); -} - -struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, - unsigned int id) -{ - struct kvm_vcpu *vcpu; - if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) - printk_once(KERN_WARNING - "kvm: SMP vm created on host with unstable TSC; " - "guest TSC will not be reliable\n"); + pr_warn_once("kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); - vcpu = kvm_x86_ops->vcpu_create(kvm, id); - - return vcpu; + return 0; } -int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { + struct page *page; + int r; + + if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + else + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; + + kvm_set_tsc_khz(vcpu, max_tsc_khz); + + r = kvm_mmu_create(vcpu); + if (r < 0) + return r; + + if (irqchip_in_kernel(vcpu->kvm)) { + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); + if (r < 0) + goto fail_mmu_destroy; + if (kvm_apicv_activated(vcpu->kvm)) + vcpu->arch.apicv_active = true; + } else + static_key_slow_inc(&kvm_no_apic_vcpu); + + r = -ENOMEM; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto fail_free_lapic; + vcpu->arch.pio_data = page_address(page); + + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.mce_banks) + goto fail_free_pio_data; + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, + GFP_KERNEL_ACCOUNT)) + goto fail_free_mce_banks; + + if (!alloc_emulate_ctxt(vcpu)) + goto free_wbinvd_dirty_mask; + + vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.user_fpu) { + pr_err("kvm: failed to allocate userspace's fpu\n"); + goto free_emulate_ctxt; + } + + vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, + GFP_KERNEL_ACCOUNT); + if (!vcpu->arch.guest_fpu) { + pr_err("kvm: failed to allocate vcpu's fpu\n"); + goto free_user_fpu; + } + fx_init(vcpu); + + vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63); + + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + + kvm_async_pf_hash_reset(vcpu); + kvm_pmu_init(vcpu); + + vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; + + kvm_hv_vcpu_init(vcpu); + + r = kvm_x86_ops.vcpu_create(vcpu); + if (r) + goto free_guest_fpu; + vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); - kvm_mmu_setup(vcpu); + kvm_init_mmu(vcpu, false); vcpu_put(vcpu); return 0; + +free_guest_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); 
+free_user_fpu: + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); +free_emulate_ctxt: + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); +free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); +fail_free_mce_banks: + kfree(vcpu->arch.mce_banks); +fail_free_pio_data: + free_page((unsigned long)vcpu->arch.pio_data); +fail_free_lapic: + kvm_free_lapic(vcpu); +fail_mmu_destroy: + kvm_mmu_destroy(vcpu); + return r; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { - struct msr_data msr; struct kvm *kvm = vcpu->kvm; kvm_hv_vcpu_postcreate(vcpu); @@ -8694,23 +10302,43 @@ if (mutex_lock_killable(&vcpu->mutex)) return; vcpu_load(vcpu); - msr.data = 0x0; - msr.index = MSR_IA32_TSC; - msr.host_initiated = true; - kvm_write_tsc(vcpu, &msr); + kvm_synchronize_tsc(vcpu, 0); vcpu_put(vcpu); + + /* poll control enabled by default */ + vcpu->arch.msr_kvm_poll_control = 1; + mutex_unlock(&vcpu->mutex); - if (!kvmclock_periodic_sync) - return; - - schedule_delayed_work(&kvm->arch.kvmclock_sync_work, - KVMCLOCK_SYNC_PERIOD); + if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - kvm_arch_vcpu_free(vcpu); + int idx; + + kvmclock_reset(vcpu); + + kvm_x86_ops.vcpu_free(vcpu); + + kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt); + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu); + kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu); + + kvm_hv_vcpu_uninit(vcpu); + kvm_pmu_destroy(vcpu); + kfree(vcpu->arch.mce_banks); + kvm_free_lapic(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); + kvm_mmu_destroy(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + free_page((unsigned long)vcpu->arch.pio_data); + kvfree(vcpu->arch.cpuid_entries); + if (!lapic_in_kernel(vcpu)) + static_key_slow_dec(&kvm_no_apic_vcpu); } void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -8726,19 +10354,18 @@ vcpu->arch.nmi_injected = false; kvm_clear_interrupt_queue(vcpu); kvm_clear_exception_queue(vcpu); - vcpu->arch.exception.pending = false; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); vcpu->arch.cr2 = 0; kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; + vcpu->arch.apf.msr_en_val = 0; + vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); @@ -8756,12 +10383,12 @@ */ if (init_event) kvm_put_guest_fpu(vcpu); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, - XFEATURE_MASK_BNDREGS); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, + XFEATURE_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); - mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave, - XFEATURE_MASK_BNDCSR); + mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, + XFEATURE_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); if (init_event) @@ -8772,7 +10399,6 @@ kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; - vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; vcpu->arch.msr_misc_features_enables = 0; vcpu->arch.xcr0 = XFEATURE_MASK_FP; @@ -8784,7 +10410,7 @@ vcpu->arch.ia32_xss = 0; - kvm_x86_ops->vcpu_reset(vcpu, init_event); + kvm_x86_ops.vcpu_reset(vcpu, init_event); } void 
kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -8808,8 +10434,8 @@ u64 max_tsc = 0; bool stable, backwards_tsc = false; - kvm_shared_msr_cpu_online(); - ret = kvm_x86_ops->hardware_enable(); + kvm_user_return_msr_cpu_online(); + ret = kvm_x86_ops.hardware_enable(); if (ret != 0) return ret; @@ -8835,7 +10461,7 @@ * before any KVM threads can be running. Unfortunately, we can't * bring the TSCs fully up to date with real time, as we aren't yet far * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, ktime_get_boot_ns() will be using boot + * elapsed; our helper function, ktime_get_boottime_ns() will be using boot * variables that haven't been updated yet. * * So we simply find the maximum observed TSC above, then record the @@ -8891,19 +10517,32 @@ void kvm_arch_hardware_disable(void) { - kvm_x86_ops->hardware_disable(); + kvm_x86_ops.hardware_disable(); drop_user_return_notifiers(); } -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { + struct kvm_x86_init_ops *ops = opaque; int r; - r = kvm_x86_ops->hardware_setup(); + rdmsrl_safe(MSR_EFER, &host_efer); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + + r = ops->hardware_setup(); if (r != 0) return r; - cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data); + memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); + + if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) + supported_xss = 0; + +#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) + cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); +#undef __kvm_cpu_cap_has if (kvm_has_tsc_control) { /* @@ -8925,12 +10564,21 @@ void kvm_arch_hardware_unsetup(void) { - kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops.hardware_unsetup(); } -void kvm_arch_check_processor_compat(void *rtn) +int kvm_arch_check_processor_compat(void *opaque) { - kvm_x86_ops->check_processor_compatibility(rtn); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + struct kvm_x86_init_ops *ops = opaque; + + WARN_ON(!irqs_disabled()); + + if (__cr4_reserved_bits(cpu_has, c) != + __cr4_reserved_bits(cpu_has, &boot_cpu_data)) + return -EIO; + + return ops->check_processor_compatibility(); } bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) @@ -8947,107 +10595,35 @@ struct static_key kvm_no_apic_vcpu __read_mostly; EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu); -int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) -{ - struct page *page; - int r; - - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - vcpu->arch.emulate_ctxt.ops = &emulate_ops; - if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - else - vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; - - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) { - r = -ENOMEM; - goto fail; - } - vcpu->arch.pio_data = page_address(page); - - kvm_set_tsc_khz(vcpu, max_tsc_khz); - - r = kvm_mmu_create(vcpu); - if (r < 0) - goto fail_free_pio_data; - - if (irqchip_in_kernel(vcpu->kvm)) { - r = kvm_create_lapic(vcpu); - if (r < 0) - goto fail_mmu_destroy; - } else - static_key_slow_inc(&kvm_no_apic_vcpu); - - vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, - GFP_KERNEL); - if (!vcpu->arch.mce_banks) { - r = -ENOMEM; - goto fail_free_lapic; - } - vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - - if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) { - r = -ENOMEM; - goto fail_free_mce_banks; - } - - fx_init(vcpu); - - 
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - - vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; - - kvm_async_pf_hash_reset(vcpu); - kvm_pmu_init(vcpu); - - vcpu->arch.pending_external_vector = -1; - vcpu->arch.preempted_in_kernel = false; - - kvm_hv_vcpu_init(vcpu); - - return 0; - -fail_free_mce_banks: - kfree(vcpu->arch.mce_banks); -fail_free_lapic: - kvm_free_lapic(vcpu); -fail_mmu_destroy: - kvm_mmu_destroy(vcpu); -fail_free_pio_data: - free_page((unsigned long)vcpu->arch.pio_data); -fail: - return r; -} - -void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) -{ - int idx; - - kvm_hv_vcpu_uninit(vcpu); - kvm_pmu_destroy(vcpu); - kfree(vcpu->arch.mce_banks); - kvm_free_lapic(vcpu); - idx = srcu_read_lock(&vcpu->kvm->srcu); - kvm_mmu_destroy(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - free_page((unsigned long)vcpu->arch.pio_data); - if (!lapic_in_kernel(vcpu)) - static_key_slow_dec(&kvm_no_apic_vcpu); -} - void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + vcpu->arch.l1tf_flush_l1d = true; - kvm_x86_ops->sched_in(vcpu, cpu); + if (pmu->version && unlikely(pmu->event_count)) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } + kvm_x86_ops.sched_in(vcpu, cpu); } + +void kvm_arch_free_vm(struct kvm *kvm) +{ + kfree(kvm->arch.hyperv.hv_pa_pg); + vfree(kvm); +} + int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + int ret; + if (type) return -EINVAL; + + ret = kvm_page_track_init(kvm); + if (ret) + return ret; INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); @@ -9066,7 +10642,7 @@ mutex_init(&kvm->arch.apic_map_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); - kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); + kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); pvclock_update_vm_gtod_copy(kvm); kvm->arch.guest_can_read_msr_platform_info = true; @@ -9075,13 +10651,9 @@ INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); kvm_hv_init_vm(kvm); - kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); - if (kvm_x86_ops->vm_init) - return kvm_x86_ops->vm_init(kvm); - - return 0; + return kvm_x86_ops.vm_init(kvm); } int kvm_arch_post_init_vm(struct kvm *kvm) @@ -9109,7 +10681,7 @@ kvm_unload_vcpu_mmu(vcpu); } kvm_for_each_vcpu(i, vcpu, kvm) - kvm_arch_vcpu_free(vcpu); + kvm_vcpu_destroy(vcpu); mutex_lock(&kvm->lock); for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) @@ -9129,9 +10701,9 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) { int i, r; - unsigned long hva; + unsigned long hva, old_npages; struct kvm_memslots *slots = kvm_memslots(kvm); - struct kvm_memory_slot *slot, old; + struct kvm_memory_slot *slot; /* Called with kvm->slots_lock held. 
*/ if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) @@ -9139,7 +10711,7 @@ slot = id_to_memslot(slots, id); if (size) { - if (slot->npages) + if (slot && slot->npages) return -EEXIST; /* @@ -9151,13 +10723,13 @@ if (IS_ERR((void *)hva)) return PTR_ERR((void *)hva); } else { - if (!slot->npages) + if (!slot || !slot->npages) return 0; + old_npages = slot->npages; hva = 0; } - old = *slot; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { struct kvm_userspace_memory_region m; @@ -9172,23 +10744,11 @@ } if (!size) - vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); + vm_munmap(hva, old_npages * PAGE_SIZE); return 0; } EXPORT_SYMBOL_GPL(__x86_set_memory_region); - -int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) -{ - int r; - - mutex_lock(&kvm->slots_lock); - r = __x86_set_memory_region(kvm, id, gpa, size); - mutex_unlock(&kvm->slots_lock); - - return r; -} -EXPORT_SYMBOL_GPL(x86_set_memory_region); void kvm_arch_pre_destroy_vm(struct kvm *kvm) { @@ -9203,46 +10763,47 @@ * unless the the memory map has changed due to process exit * or fd copying. */ - x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); - x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_lock(&kvm->slots_lock); + __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, + 0, 0); + __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); + mutex_unlock(&kvm->slots_lock); } - if (kvm_x86_ops->vm_destroy) - kvm_x86_ops->vm_destroy(kvm); + if (kvm_x86_ops.vm_destroy) + kvm_x86_ops.vm_destroy(kvm); + kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); + kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); kvm_page_track_cleanup(kvm); kvm_hv_destroy_vm(kvm); } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { int i; for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { - if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { - kvfree(free->arch.rmap[i]); - free->arch.rmap[i] = NULL; - } + kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) continue; - if (!dont || free->arch.lpage_info[i - 1] != - dont->arch.lpage_info[i - 1]) { - kvfree(free->arch.lpage_info[i - 1]); - free->arch.lpage_info[i - 1] = NULL; - } + kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } - kvm_page_track_free_memslot(free, dont); + kvm_page_track_free_memslot(slot); } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot, + unsigned long npages) { int i; @@ -9264,13 +10825,13 @@ slot->arch.rmap[i] = kvcalloc(lpages, sizeof(*slot->arch.rmap[i]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!slot->arch.rmap[i]) goto out_free; if (i == 0) continue; - linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL); + linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) goto out_free; @@ -9283,11 +10844,9 @@ ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this 
slot + * other, disable large page support for this slot. */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !kvm_largepages_enabled()) { + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { unsigned long j; for (j = 0; j < lpages; ++j) @@ -9334,76 +10893,23 @@ const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - if (change == KVM_MR_MOVE) - return kvm_arch_create_memslot(kvm, memslot, - mem->memory_size >> PAGE_SHIFT); - + if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) + return kvm_alloc_memslot_metadata(memslot, + mem->memory_size >> PAGE_SHIFT); return 0; } static void kvm_mmu_slot_apply_flags(struct kvm *kvm, - struct kvm_memory_slot *new) + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { - /* Still write protect RO slot */ - if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new); - return; - } - /* - * Call kvm_x86_ops dirty logging hooks when they are valid. - * - * kvm_x86_ops->slot_disable_log_dirty is called when: - * - * - KVM_MR_CREATE with dirty logging is disabled - * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag - * - * The reason is, in case of PML, we need to set D-bit for any slots - * with dirty logging disabled in order to eliminate unnecessary GPA - * logging in PML buffer (and potential PML buffer full VMEXT). This - * guarantees leaving PML enabled during guest's lifetime won't have - * any additonal overhead from PML when guest is running with dirty - * logging disabled for memory slots. - * - * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot - * to dirty logging mode. - * - * If kvm_x86_ops dirty logging hooks are invalid, use write protect. - * - * In case of write protect: - * - * Write protect all pages for dirty logging. - * - * All the sptes including the large sptes which point to this - * slot are set to readonly. We can not create any new large - * spte on this slot until the end of the logging. - * - * See the comments in fast_page_fault(). + * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot. + * See comments below. */ - if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { - if (kvm_x86_ops->slot_enable_log_dirty) - kvm_x86_ops->slot_enable_log_dirty(kvm, new); - else - kvm_mmu_slot_remove_write_access(kvm, new); - } else { - if (kvm_x86_ops->slot_disable_log_dirty) - kvm_x86_ops->slot_disable_log_dirty(kvm, new); - } -} - -void kvm_arch_commit_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, - const struct kvm_memory_slot *new, - enum kvm_mr_change change) -{ - int nr_mmu_pages = 0; - - if (!kvm->arch.n_requested_mmu_pages) - nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - - if (nr_mmu_pages) - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); + if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY)) + return; /* * Dirty logging tracks sptes in 4k granularity, meaning that large @@ -9416,29 +10922,91 @@ * Scan sptes if dirty logging has been stopped, dropping those * which can be collapsed into a single large-page spte. Later * page faults will create the large-page sptes. + * + * There is no need to do this in any of the following cases: + * CREATE: No dirty mappings will already exist. 
+ * MOVE/DELETE: The old mappings will already have been cleaned up by + * kvm_arch_flush_shadow_memslot() */ - if ((change != KVM_MR_DELETE) && - (old->flags & KVM_MEM_LOG_DIRTY_PAGES) && - !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) && + !(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_zap_collapsible_sptes(kvm, new); /* - * Set up write protection and/or dirty logging for the new slot. + * Enable or disable dirty logging for the slot. * - * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have - * been zapped so no dirty logging staff is needed for old slot. For - * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the - * new and it's also covered when dealing with the new slot. + * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old + * slot have been zapped so no dirty logging updates are needed for + * the old slot. + * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible + * any mappings that might be created in it will consume the + * properties of the new slot and do not need to be updated here. * + * When PML is enabled, the kvm_x86_ops dirty logging hooks are + * called to enable/disable dirty logging. + * + * When disabling dirty logging with PML enabled, the D-bit is set + * for sptes in the slot in order to prevent unnecessary GPA + * logging in the PML buffer (and potential PML buffer full VMEXIT). + * This guarantees leaving PML enabled for the guest's lifetime + * won't have any additional overhead from PML when the guest is + * running with dirty logging disabled. + * + * When enabling dirty logging, large sptes are write-protected + * so they can be split on first write. New large sptes cannot + * be created for this slot until the end of the logging. + * See the comments in fast_page_fault(). + * For small sptes, nothing is done if the dirty log is in the + * initial-all-set state. Otherwise, depending on whether pml + * is enabled the D-bit or the W-bit will be cleared. + */ + if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (kvm_x86_ops.slot_enable_log_dirty) { + kvm_x86_ops.slot_enable_log_dirty(kvm, new); + } else { + int level = + kvm_dirty_log_manual_protect_and_init_set(kvm) ? + PG_LEVEL_2M : PG_LEVEL_4K; + + /* + * If we're with initial-all-set, we don't need + * to write protect any small page because + * they're reported as dirty already. However + * we still need to write-protect huge pages + * so that the page split can happen lazily on + * the first write to the huge page. + */ + kvm_mmu_slot_remove_write_access(kvm, new, level); + } + } else { + if (kvm_x86_ops.slot_disable_log_dirty) + kvm_x86_ops.slot_disable_log_dirty(kvm, new); + } +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) +{ + if (!kvm->arch.n_requested_mmu_pages) + kvm_mmu_change_mmu_pages(kvm, + kvm_mmu_calculate_default_mmu_pages(kvm)); + + /* * FIXME: const-ify all uses of struct kvm_memory_slot. */ - if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); + kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + + /* Free the arrays associated with the old memslot. 
*/ + if (change == KVM_MR_MOVE) + kvm_arch_free_memslot(kvm, old); } void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_mmu_zap_all(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -9450,8 +11018,8 @@ static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu) { return (is_guest_mode(vcpu) && - kvm_x86_ops->guest_apic_has_interrupt && - kvm_x86_ops->guest_apic_has_interrupt(vcpu)); + kvm_x86_ops.guest_apic_has_interrupt && + kvm_x86_ops.guest_apic_has_interrupt(vcpu)); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) @@ -9470,11 +11038,12 @@ if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops->nmi_allowed(vcpu))) + kvm_x86_ops.nmi_allowed(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || - (vcpu->arch.smi_pending && !is_smm(vcpu))) + (vcpu->arch.smi_pending && + kvm_x86_ops.smi_allowed(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -9483,6 +11052,11 @@ return true; if (kvm_hv_has_stimer_pending(vcpu)) + return true; + + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) return true; return false; @@ -9503,7 +11077,7 @@ kvm_test_request(KVM_REQ_EVENT, vcpu)) return true; - if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu)) + if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu)) return true; return false; @@ -9521,7 +11095,7 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -9543,7 +11117,7 @@ { unsigned long rflags; - rflags = kvm_x86_ops->get_rflags(vcpu); + rflags = kvm_x86_ops.get_rflags(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) rflags &= ~X86_EFLAGS_TF; return rflags; @@ -9555,7 +11129,7 @@ if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; - kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_x86_ops.set_rflags(vcpu, rflags); } void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) @@ -9569,7 +11143,7 @@ { int r; - if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) || + if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) || work->wakeup_all) return; @@ -9577,21 +11151,23 @@ if (unlikely(r)) return; - if (!vcpu->arch.mmu.direct_map && - work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu)) + if (!vcpu->arch.mmu->direct_map && + work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu)) return; - vcpu->arch.mmu.page_fault(vcpu, work->cr2_or_gpa, 0, true); + kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true); } static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); } static inline u32 kvm_async_pf_next_probe(u32 key) { - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); } static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -9609,7 +11185,7 @@ int i; u32 key = kvm_async_pf_hash_fn(gfn); - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + for (i = 0; i < ASYNC_PF_PER_VCPU && (vcpu->arch.apf.gfns[key] != gfn && vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); @@ -9627,6 +11203,10 @@ u32 i, j, k; i = j = 
kvm_async_pf_gfn_slot(vcpu, gfn); + + if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn)) + return; + while (true) { vcpu->arch.apf.gfns[i] = ~0; do { @@ -9645,21 +11225,64 @@ } } -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) { + u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, + sizeof(reason)); } -static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) { + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); - return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, - sizeof(u32)); + return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &token, offset, sizeof(token)); } -void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, +static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) +{ + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + u32 val; + + if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &val, offset, sizeof(val))) + return false; + + return !val; +} + +static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) + return false; + + if (!kvm_pv_async_pf_enabled(vcpu) || + (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) + return false; + + return true; +} + +bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!lapic_in_kernel(vcpu) || + kvm_event_needs_reinjection(vcpu) || + vcpu->arch.exception.pending)) + return false; + + if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu)) + return false; + + /* + * If interrupts are off we cannot even use an artificial + * halt state. + */ + return kvm_arch_interrupt_allowed(vcpu); +} + +bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; @@ -9667,11 +11290,8 @@ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa); kvm_add_async_pf_gfn(vcpu, work->arch.gfn); - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops->get_cpl(vcpu) == 0)) - kvm_make_request(KVM_REQ_APF_HALT, vcpu); - else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + if (kvm_can_deliver_async_pf(vcpu) && + !apf_put_user_notpresent(vcpu)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -9679,14 +11299,28 @@ fault.address = work->arch.token; fault.async_page_fault = true; kvm_inject_page_fault(vcpu, &fault); + return true; + } else { + /* + * It is not possible to deliver a paravirtualized asynchronous + * page fault, but putting the guest in an artificial halt state + * can be beneficial nevertheless: if an interrupt arrives, we + * can deliver it timely and perhaps the guest will schedule + * another process. When the instruction that triggered a page + * fault is retried, hopefully the page will be ready in the host. 
+ */ + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return false; } } void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - struct x86_exception fault; - u32 val; + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vcpu->arch.apf.vec + }; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -9694,37 +11328,30 @@ kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_get_user(vcpu, &val)) { - if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && - vcpu->arch.exception.pending && - vcpu->arch.exception.nr == PF_VECTOR && - !apf_put_user(vcpu, 0)) { - vcpu->arch.exception.injected = false; - vcpu->arch.exception.pending = false; - vcpu->arch.exception.nr = 0; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.error_code = 0; - } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); - } + if ((work->wakeup_all || work->notpresent_injected) && + kvm_pv_async_pf_enabled(vcpu) && + !apf_put_user_ready(vcpu, work->arch.token)) { + vcpu->arch.apf.pageready_pending = true; + kvm_apic_set_irq(vcpu, &irq, NULL); } + vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + kvm_make_request(KVM_REQ_APF_READY, vcpu); + if (!vcpu->arch.apf.pageready_pending) + kvm_vcpu_kick(vcpu); +} + +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) +{ + if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return kvm_can_do_async_pf(vcpu); + return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) @@ -9739,9 +11366,9 @@ } EXPORT_SYMBOL_GPL(kvm_arch_end_assignment); -bool kvm_arch_has_assigned_device(struct kvm *kvm) +bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm) { - return atomic_read(&kvm->arch.assigned_device_count); + return arch_atomic_read(&kvm->arch.assigned_device_count); } EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device); @@ -9765,7 +11392,7 @@ bool kvm_arch_has_irq_bypass(void) { - return kvm_x86_ops->update_pi_irte != NULL; + return true; } int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, @@ -9773,11 +11400,17 @@ { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + int ret; irqfd->producer = prod; + kvm_arch_start_assignment(irqfd->kvm); + ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, + prod->irq, irqfd->gsi, 1); - return kvm_x86_ops->update_pi_irte(irqfd->kvm, - prod->irq, irqfd->gsi, 1); + if (ret) + kvm_arch_end_assignment(irqfd->kvm); + + return ret; } void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, @@ -9796,26 +11429,185 @@ * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. 
*/ - ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); + ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + + kvm_arch_end_assignment(irqfd->kvm); } int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set) { - if (!kvm_x86_ops->update_pi_irte) - return -EINVAL; - - return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set); + return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set); } bool kvm_vector_hashing_enabled(void) { return vector_hashing; } -EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled); + +bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.msr_kvm_poll_control & 1) == 0; +} +EXPORT_SYMBOL_GPL(kvm_arch_no_poll); + + +int kvm_spec_ctrl_test_value(u64 value) +{ + /* + * test that setting IA32_SPEC_CTRL to given value + * is allowed by the host processor + */ + + u64 saved_value; + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + + if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); + +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) +{ + struct x86_exception fault; + u32 access = error_code & + (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK); + + if (!(error_code & PFERR_PRESENT_MASK) || + vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) { + /* + * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * tables probably do not match the TLB. Just proceed + * with the error code that the processor gave. + */ + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = error_code; + fault.nested_page_fault = false; + fault.address = gva; + } + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); +} +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); + +/* + * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns + * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value + * indicates whether exit to userspace is needed. + */ +int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, + struct x86_exception *e) +{ + if (r == X86EMUL_PROPAGATE_FAULT) { + kvm_inject_emulated_page_fault(vcpu, e); + return 1; + } + + /* + * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED + * while handling a VMX instruction KVM could've handled the request + * correctly by exiting to userspace and performing I/O but there + * doesn't seem to be a real use-case behind such requests, just return + * KVM_EXIT_INTERNAL_ERROR for now. 
+ */ + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_handle_memory_failure); + +int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) +{ + bool pcid_enabled; + struct x86_exception e; + unsigned i; + unsigned long roots_to_free = 0; + struct { + u64 pcid; + u64 gla; + } operand; + int r; + + r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); + if (r != X86EMUL_CONTINUE) + return kvm_handle_memory_failure(vcpu, r, &e); + + if (operand.pcid >> 12 != 0) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE); + + switch (type) { + case INVPCID_TYPE_INDIV_ADDR: + if ((!pcid_enabled && (operand.pcid != 0)) || + is_noncanonical_address(operand.gla, vcpu)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid); + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_SINGLE_CTXT: + if (!pcid_enabled && (operand.pcid != 0)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + + if (kvm_get_active_pcid(vcpu) == operand.pcid) { + kvm_mmu_sync_roots(vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) + == operand.pcid) + roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); + + kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free); + /* + * If neither the current cr3 nor any of the prev_roots use the + * given PCID, then nothing needs to be done here because a + * resync will happen anyway before switching to any other CR3. + */ + + return kvm_skip_emulated_instruction(vcpu); + + case INVPCID_TYPE_ALL_NON_GLOBAL: + /* + * Currently, KVM doesn't mark global entries in the shadow + * page tables, so a non-global flush just degenerates to a + * global flush. If needed, we could optimize this later by + * keeping track of global entries in shadow page tables. + */ + + fallthrough; + case INVPCID_TYPE_ALL_INCL_GLOBAL: + kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); + return kvm_skip_emulated_instruction(vcpu); + + default: + BUG(); /* We have already checked above that type <= 3 */ + } +} +EXPORT_SYMBOL_GPL(kvm_handle_invpcid); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); @@ -9827,12 +11619,31 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); -EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request); + +static int __init kvm_x86_init(void) +{ + kvm_mmu_x86_module_init(); + return 0; +} +module_init(kvm_x86_init); + +static void __exit kvm_x86_exit(void) +{ + /* + * If module_init() is implemented, module_exit() must also be + * implemented to allow module unload. 
+ */ +} +module_exit(kvm_x86_exit); -- Gitblit v1.6.2
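
A note on the APICv inhibit handling added earlier in this patch: kvm_request_apicv_update() toggles a bit in kvm->arch.apicv_inhibit_reasons with a lock-free read-modify-write retry loop rather than under a lock, and only acts when the overall active/inhibited state actually flips. The sketch below is not part of the patch; it is a minimal userspace illustration of that pattern, with GCC/Clang __atomic builtins standing in for the kernel's READ_ONCE()/cmpxchg(), and hypothetical names (inhibit_reasons, update_inhibit_bit) standing in for the kvm->arch fields.

#include <stdbool.h>
#include <stdio.h>

static unsigned long inhibit_reasons;	/* stand-in for kvm->arch.apicv_inhibit_reasons */

static void update_inhibit_bit(bool activate, unsigned long bit)
{
	unsigned long old, new;

	old = __atomic_load_n(&inhibit_reasons, __ATOMIC_RELAXED);
	do {
		new = old;
		if (activate)
			new &= ~(1UL << bit);	/* clearing a reason re-activates APICv */
		else
			new |= (1UL << bit);	/* setting a reason inhibits APICv */
		if (new == old)
			return;			/* unchanged: no update request would be sent */
		/* on failure the builtin reloads 'old' and the loop recomputes 'new' */
	} while (!__atomic_compare_exchange_n(&inhibit_reasons, &old, new, false,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));

	/* only a transition between "no reasons" and "some reason" matters */
	if (!!old != !!new)
		printf("APICv is now %s\n", new ? "inhibited" : "active");
}

int main(void)
{
	update_inhibit_bit(false, 3);	/* first inhibit reason: state flips */
	update_inhibit_bit(false, 3);	/* already set: early return, no flip */
	update_inhibit_bit(true, 3);	/* last reason cleared: state flips back */
	return 0;
}

The early return when new == old and the final !!old != !!new check mirror the patch's behaviour of skipping the request when the inhibit set is unchanged and of reacting only when APICv switches between fully active and inhibited.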