From 9d77db3c730780c8ef5ccd4b66403ff5675cfe4e Mon Sep 17 00:00:00 2001 From: hc <hc@nodka.com> Date: Mon, 13 May 2024 10:30:14 +0000 Subject: [PATCH] modify sin led gpio --- kernel/arch/x86/kernel/kvm.c | 632 +++++++++++++++++++++++++++++++++++++-------------------- 1 files changed, 409 insertions(+), 223 deletions(-) diff --git a/kernel/arch/x86/kernel/kvm.c b/kernel/arch/x86/kernel/kvm.c index cee45d4..fe9babe 100644 --- a/kernel/arch/x86/kernel/kvm.c +++ b/kernel/arch/x86/kernel/kvm.c @@ -1,27 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * KVM paravirt_ops implementation - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright IBM Corporation, 2007 * Authors: Anthony Liguori <aliguori@us.ibm.com> */ +#define pr_fmt(fmt) "kvm-guest: " fmt + #include <linux/context_tracking.h> #include <linux/init.h> +#include <linux/irq.h> #include <linux/kernel.h> #include <linux/kvm_para.h> #include <linux/cpu.h> @@ -34,9 +24,9 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/kprobes.h> -#include <linux/debugfs.h> #include <linux/nmi.h> #include <linux/swait.h> +#include <linux/syscore_ops.h> #include <asm/timer.h> #include <asm/cpu.h> #include <asm/traps.h> @@ -46,6 +36,12 @@ #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/tlb.h> +#include <asm/cpuidle_haltpoll.h> +#include <asm/ptrace.h> +#include <asm/reboot.h> +#include <asm/svm.h> + +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); static int kvmapf = 1; @@ -67,9 +63,10 @@ early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; +static int has_guest_poll = 0; /* * No need for any "IO delay" on KVM */ @@ -85,7 +82,6 @@ struct swait_queue_head wq; u32 token; int cpu; - bool halted; }; static struct kvm_task_sleep_head { @@ -108,77 +104,64 @@ return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other than user space)? 
- */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); - - rcu_irq_exit(); - return; + kfree(e); + return false; } - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); + n->token = token; + n->cpu = smp_processor_id(); + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. + */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; - rcu_irq_exit(); - - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. - */ - native_safe_halt(); - local_irq_disable(); - } - - rcu_irq_enter(); + local_irq_enable(); + schedule(); + local_irq_disable(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; + finish_swait(&n.wq, &wait); } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) - smp_send_reschedule(n->cpu); - else if (swq_has_sleeper(&n->wq)) + if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); } @@ -187,12 +170,13 @@ int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -204,7 +188,7 @@ { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node *n; + struct kvm_task_sleep_node *n, *dummy = NULL; if (token == ~0) { apf_task_wake_all(); @@ -216,74 +200,115 @@ n = _find_apf_task(b, token); if (!n) { /* - * async PF was not yet handled. - * Add dummy entry for the token. + * Async #PF not yet handled, add a dummy entry for the token. + * Allocating the token must be down outside of the raw lock + * as the allocator is preemptible on PREEMPT_RT kernels. */ - n = kzalloc(sizeof(*n), GFP_ATOMIC); - if (!n) { - /* - * Allocation failed! 
Busy wait while other cpu - * handles async PF. - */ + if (!dummy) { raw_spin_unlock(&b->lock); - cpu_relax(); + dummy = kzalloc(sizeof(*dummy), GFP_ATOMIC); + + /* + * Continue looping on allocation failure, eventually + * the async #PF will be handled and allocating a new + * node will be unnecessary. + */ + if (!dummy) + cpu_relax(); + + /* + * Recheck for async #PF completion before enqueueing + * the dummy token to avoid duplicate list entries. + */ goto again; } - n->token = token; - n->cpu = smp_processor_id(); - init_swait_queue_head(&n->wq); - hlist_add_head(&n->link, &b->list); - } else + dummy->token = token; + dummy->cpu = smp_processor_id(); + init_swait_queue_head(&dummy->wq); + hlist_add_head(&dummy->link, &b->list); + dummy = NULL; + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); - return; + + /* A dummy token might be allocated and ultimately not used. */ + if (dummy) + kfree(dummy); } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +noinstr u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code) +noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - enum ctx_state prev_state; + u32 flags = kvm_read_and_reset_apf_flags(); + irqentry_state_t state; - switch (kvm_read_and_reset_pf_reason()) { - default: - do_page_fault(regs, error_code); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: - /* page is swapped out by the host. */ - prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); - exception_exit(prev_state); - break; - case KVM_PV_REASON_PAGE_READY: - rcu_irq_enter(); - kvm_async_pf_task_wake((u32)read_cr2()); - rcu_irq_exit(); - break; + if (!flags) + return false; + + state = irqentry_enter(regs); + instrumentation_begin(); + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (unlikely(!(user_mode(regs)))) + panic("Host injected async #PF in kernel mode\n"); + /* Page is swapped out by the host. 
*/ + kvm_async_pf_task_wait_schedule(token); + } else { + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } + + instrumentation_end(); + irqentry_exit(regs, state); + return true; } -NOKPROBE_SYMBOL(do_async_page_fault); + +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + + ack_APIC_irq(); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + set_irq_regs(old_regs); +} static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) - pv_cpu_ops.io_delay = kvm_io_delay; + pv_ops.cpu.io_delay = kvm_io_delay; #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; @@ -299,8 +324,8 @@ return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); + pr_info("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -321,28 +346,27 @@ static void kvm_guest_cpu_init(void) { - if (!kvm_para_available()) - return; - - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); -#ifdef CONFIG_PREEMPT - pa |= KVM_ASYNC_PF_SEND_ALWAYS; -#endif - pa |= KVM_ASYNC_PF_ENABLED; + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. 
*/ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -363,8 +387,15 @@ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); +} + +static void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); } static void kvm_pv_guest_cpu_reboot(void *unused) @@ -409,14 +440,6 @@ return steal; } -void kvm_disable_steal_time(void) -{ - if (!has_steal_clock) - return; - - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); -} - static inline void __set_percpu_decrypted(void *ptr, unsigned long size) { early_set_memory_decrypted((unsigned long) ptr, size); @@ -444,7 +467,50 @@ } } +static bool pv_tlb_flush_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + +static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask); + +static void kvm_guest_cpu_offline(bool shutdown) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); + if (!shutdown) + apf_task_wake_all(); + kvmclock_disable(); +} + +static int kvm_cpu_online(unsigned int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_init(); + local_irq_restore(flags); + return 0; +} + #ifdef CONFIG_SMP + +static bool pv_ipi_supported(void) +{ + return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI); +} + +static bool pv_sched_yield_supported(void) +{ + return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) && + !kvm_para_has_hint(KVM_HINTS_REALTIME) && + kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)); +} + #define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG) static void __send_ipi_mask(const struct cpumask *mask, int vector) @@ -480,12 +546,13 @@ } else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) { ipi_bitmap <<= min - apic_id; min = apic_id; - } else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) { + } else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) { max = apic_id < max ? 
max : apic_id; } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); min = max = apic_id; ipi_bitmap = 0; } @@ -495,7 +562,8 @@ if (ipi_bitmap) { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); } local_irq_restore(flags); @@ -509,23 +577,13 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) { unsigned int this_cpu = smp_processor_id(); - struct cpumask new_mask; + struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); const struct cpumask *local_mask; - cpumask_copy(&new_mask, mask); - cpumask_clear_cpu(this_cpu, &new_mask); - local_mask = &new_mask; + cpumask_copy(new_mask, mask); + cpumask_clear_cpu(this_cpu, new_mask); + local_mask = new_mask; __send_ipi_mask(local_mask, vector); -} - -static void kvm_send_ipi_allbutself(int vector) -{ - kvm_send_ipi_mask_allbutself(cpu_online_mask, vector); -} - -static void kvm_send_ipi_all(int vector) -{ - __send_ipi_mask(cpu_online_mask, vector); } /* @@ -535,16 +593,22 @@ { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - apic->send_IPI_allbutself = kvm_send_ipi_allbutself; - apic->send_IPI_all = kvm_send_ipi_all; - pr_info("KVM setup pv IPIs\n"); + pr_info("setup PV IPIs\n"); } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) +static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) { - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); + int cpu; + + native_send_call_func_ipi(mask); + + /* Make sure other vCPUs get a chance to run if they need to. 
*/ + for_each_cpu(cpu, mask) { + if (vcpu_is_preempted(cpu)) { + kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu)); + break; + } + } } static void __init kvm_smp_prepare_boot_cpu(void) @@ -560,38 +624,60 @@ kvm_spinlock_init(); } -static void kvm_guest_cpu_offline(void) -{ - kvm_disable_steal_time(); - if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); - kvm_pv_disable_apf(); - apf_task_wake_all(); -} - -static int kvm_cpu_online(unsigned int cpu) -{ - local_irq_disable(); - kvm_guest_cpu_init(); - local_irq_enable(); - return 0; -} - static int kvm_cpu_down_prepare(unsigned int cpu) { - local_irq_disable(); - kvm_guest_cpu_offline(); - local_irq_enable(); + unsigned long flags; + + local_irq_save(flags); + kvm_guest_cpu_offline(false); + local_irq_restore(flags); return 0; } + #endif -static void __init kvm_apf_trap_init(void) +static int kvm_suspend(void) { - update_intr_gate(X86_TRAP_PF, async_page_fault); + u64 val = 0; + + kvm_guest_cpu_offline(false); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + rdmsrl(MSR_KVM_POLL_CONTROL, val); + has_guest_poll = !(val & 1); +#endif + return 0; } -static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask); +static void kvm_resume(void) +{ + kvm_cpu_online(raw_smp_processor_id()); + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +#endif +} + +static struct syscore_ops kvm_syscore_ops = { + .suspend = kvm_suspend, + .resume = kvm_resume, +}; + +/* + * After a PV feature is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. 
+ */ +#ifdef CONFIG_KEXEC_CORE +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + kvm_guest_cpu_offline(true); + native_machine_crash_shutdown(regs); +} +#endif static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) @@ -599,7 +685,7 @@ u8 state; int cpu; struct kvm_steal_time *src; - struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask); + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask); cpumask_copy(flushmask, cpumask); /* @@ -623,41 +709,49 @@ { int i; - if (!kvm_para_available()) - return; - paravirt_ops_setup(); register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; - pv_time_ops.steal_clock = kvm_steal_clock; + pv_ops.time.steal_clock = kvm_steal_clock; } - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; - pv_mmu_ops.tlb_remove_table = tlb_remove_table; + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others; + pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } + #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + if (pv_sched_yield_supported()) { + smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; + pr_info("setup PV sched yield\n"); + } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); #endif + +#ifdef CONFIG_KEXEC_CORE + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + + register_syscore_ops(&kvm_syscore_ops); /* * Hard lockup detection is enabled by default. 
Disable it, as guests @@ -703,6 +797,7 @@ { return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES); } +EXPORT_SYMBOL_GPL(kvm_arch_para_hints); static uint32_t __init kvm_detect(void) { @@ -712,7 +807,7 @@ static void __init kvm_apic_init(void) { #if defined(CONFIG_SMP) - if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI)) + if (pv_ipi_supported()) kvm_setup_pv_ipi(); #endif } @@ -723,13 +818,34 @@ x86_platform.apic_post_init = kvm_apic_init; } +#if defined(CONFIG_AMD_MEM_ENCRYPT) +static void kvm_sev_es_hcall_prepare(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* RAX and CPL are already in the GHCB */ + ghcb_set_rbx(ghcb, regs->bx); + ghcb_set_rcx(ghcb, regs->cx); + ghcb_set_rdx(ghcb, regs->dx); + ghcb_set_rsi(ghcb, regs->si); +} + +static bool kvm_sev_es_hcall_finish(struct ghcb *ghcb, struct pt_regs *regs) +{ + /* No checking of the return state needed */ + return true; +} +#endif + const __initconst struct hypervisor_x86 x86_hyper_kvm = { - .name = "KVM", - .detect = kvm_detect, - .type = X86_HYPER_KVM, - .init.guest_late_init = kvm_guest_init, - .init.x2apic_available = kvm_para_available, - .init.init_platform = kvm_init_platform, + .name = "KVM", + .detect = kvm_detect, + .type = X86_HYPER_KVM, + .init.guest_late_init = kvm_guest_init, + .init.x2apic_available = kvm_para_available, + .init.init_platform = kvm_init_platform, +#if defined(CONFIG_AMD_MEM_ENCRYPT) + .runtime.sev_es_hcall_prepare = kvm_sev_es_hcall_prepare, + .runtime.sev_es_hcall_finish = kvm_sev_es_hcall_finish, +#endif }; static __init int activate_jump_labels(void) @@ -744,23 +860,31 @@ } arch_initcall(activate_jump_labels); -static __init int kvm_setup_pv_tlb_flush(void) +static __init int kvm_alloc_cpumask(void) { int cpu; + bool alloc = false; - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && - !kvm_para_has_hint(KVM_HINTS_REALTIME) && - kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + if (!kvm_para_available() || nopv) + return 0; + + if (pv_tlb_flush_supported()) + alloc = true; + +#if defined(CONFIG_SMP) + if (pv_ipi_supported()) + alloc = true; +#endif + + if (alloc) for_each_possible_cpu(cpu) { - zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); } - pr_info("KVM setup pv remote TLB flush\n"); - } return 0; } -arch_initcall(kvm_setup_pv_tlb_flush); +arch_initcall(kvm_alloc_cpumask); #ifdef CONFIG_PARAVIRT_SPINLOCKS @@ -829,7 +953,7 @@ "movq __per_cpu_offset(,%rdi,8), %rax;" "cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" "setne %al;" -"ret;" +ASM_RET ".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;" ".popsection"); @@ -840,29 +964,91 @@ */ void __init kvm_spinlock_init(void) { - if (!kvm_para_available()) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) - return; + } - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. 
+ */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); - pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; - pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); - pv_lock_ops.wait = kvm_wait; - pv_lock_ops.kick = kvm_kick_cpu; + pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_ops.lock.queued_spin_unlock = + PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_ops.lock.wait = kvm_wait; + pv_ops.lock.kick = kvm_kick_cpu; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { - pv_lock_ops.vcpu_is_preempted = + pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL + +static void kvm_disable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 0); +} + +static void kvm_enable_host_haltpoll(void *i) +{ + wrmsrl(MSR_KVM_POLL_CONTROL, 1); +} + +void arch_haltpoll_enable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); + return; + } + + /* Enable guest halt poll disables host halt poll */ + smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_enable); + +void arch_haltpoll_disable(unsigned int cpu) +{ + if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) + return; + + /* Disable guest halt poll enables host halt poll */ + smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1); +} +EXPORT_SYMBOL_GPL(arch_haltpoll_disable); +#endif -- Gitblit v1.6.2
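
For reference, the cluster packing performed by __send_ipi_mask() in the hunk above (the one whose "apic_id > min" condition this patch tightens) can be sketched as a small standalone C program. This is only an illustration, not part of the patch: the hypercall is replaced by a printf(), flush_ipi() is a hypothetical helper, and the APIC ID values are made up.

/*
 * Minimal userspace sketch of how __send_ipi_mask() packs APIC IDs into a
 * 128-bit bitmap anchored at "min" and flushes it with KVM_HC_SEND_IPI
 * whenever the next APIC ID does not fit in the 2*BITS_PER_LONG window.
 * Assumption: flush_ipi() stands in for kvm_hypercall4(KVM_HC_SEND_IPI, ...).
 */
#include <stdio.h>

#define BITS_PER_LONG		64
#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)

static void flush_ipi(unsigned __int128 bitmap, int min)
{
	/* In the kernel this is the KVM_HC_SEND_IPI hypercall. */
	printf("SEND_IPI: lo=%016llx hi=%016llx min=%d\n",
	       (unsigned long long)bitmap,
	       (unsigned long long)(bitmap >> BITS_PER_LONG), min);
}

int main(void)
{
	/* Hypothetical APIC IDs of the target vCPUs. */
	int apic_ids[] = { 0, 1, 5, 130, 131, 400 };
	unsigned __int128 ipi_bitmap = 0;
	int min = 0, max = 0;

	for (unsigned int i = 0; i < sizeof(apic_ids) / sizeof(apic_ids[0]); i++) {
		int apic_id = apic_ids[i];

		if (!ipi_bitmap) {
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			/* Smaller ID still fits: slide the window down to it. */
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id > min && apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			/* Larger ID inside the window: just widen max. */
			max = apic_id < max ? max : apic_id;
		} else {
			/* Out of range: flush this cluster, start a new one. */
			flush_ipi(ipi_bitmap, min);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		ipi_bitmap |= (unsigned __int128)1 << (apic_id - min);
	}

	if (ipi_bitmap)
		flush_ipi(ipi_bitmap, min);

	return 0;
}

With the example IDs above, {0, 1, 5} go out in one hypercall, {130, 131} in a second, and {400} in a third, which is exactly the batching behaviour the bitmap window is meant to provide.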