From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/arch/x86/kvm/x86.c | 187 +++++++++++++++++++++++++++++++---------------
1 files changed, 126 insertions(+), 61 deletions(-)
diff --git a/kernel/arch/x86/kvm/x86.c b/kernel/arch/x86/kvm/x86.c
index 4c62278..9d30158 100644
--- a/kernel/arch/x86/kvm/x86.c
+++ b/kernel/arch/x86/kvm/x86.c
@@ -231,6 +231,8 @@
VCPU_STAT("l1d_flush", l1d_flush),
VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VCPU_STAT("preemption_reported", preemption_reported),
+ VCPU_STAT("preemption_other", preemption_other),
VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
VM_STAT("mmu_pte_write", mmu_pte_write),
VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -1387,7 +1389,7 @@
ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
- ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
static u64 kvm_get_arch_capabilities(void)
{
@@ -1443,6 +1445,9 @@
* using VERW to clear CPU buffers.
*/
}
+
+ if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+ data |= ARCH_CAP_GDS_NO;
return data;
}
@@ -1586,6 +1591,9 @@
allowed = !!test_bit(index - start, bitmap);
break;
}
+
+ /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ ++vcpu->stat.exits;
}
out:
@@ -3020,51 +3028,95 @@
static void record_steal_time(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ u64 steal;
+ u32 version;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- /* -EAGAIN is returned in atomic context so we can just return. */
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
- &map, &vcpu->arch.st.cache, false))
+ if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+ /* We rely on the fact that it fits in a single page. */
+ BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+ return;
+ }
+
+ st = (struct kvm_steal_time __user *)ghc->hva;
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ u8 st_preempted = 0;
+ int err = -EFAULT;
+
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ asm volatile("1: xchgb %0, %2\n"
+ "xor %1, %1\n"
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+q" (st_preempted),
+ "+&r" (err),
+ "+m" (st->preempted));
+ if (err)
+ goto out;
+
+ user_access_end();
+
+ vcpu->arch.st.preempted = 0;
+
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- st->preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st_preempted & KVM_VCPU_FLUSH_TLB);
+ if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
+
+ if (!user_access_begin(st, sizeof(*st)))
+ goto dirty;
} else {
- st->preempted = 0;
+ if (!user_access_begin(st, sizeof(*st)))
+ return;
+
+ unsafe_put_user(0, &st->preempted, out);
+ vcpu->arch.st.preempted = 0;
}
- vcpu->arch.st.preempted = 0;
+ unsafe_get_user(version, &st->version, out);
+ if (version & 1)
+ version += 1; /* first time write, random junk */
- if (st->version & 1)
- st->version += 1; /* first time write, random junk */
-
- st->version += 1;
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
smp_wmb();
- st->steal += current->sched_info.run_delay -
+ unsafe_get_user(steal, &st->steal, out);
+ steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
+ unsafe_put_user(steal, &st->steal, out);
- smp_wmb();
+ version += 1;
+ unsafe_put_user(version, &st->version, out);
- st->version += 1;
-
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+ user_access_end();
+ dirty:
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -4049,51 +4101,67 @@
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
- struct kvm_host_map map;
- struct kvm_steal_time *st;
+ struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+ struct kvm_steal_time __user *st;
+ struct kvm_memslots *slots;
+ static const u8 preempted = KVM_VCPU_PREEMPTED;
+ gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+ /*
+ * The vCPU can be marked preempted if and only if the VM-Exit was on
+ * an instruction boundary and will not trigger guest emulation of any
+ * kind (see vcpu_run). Vendor specific code controls (conservatively)
+ * when this is true, for example allowing the vCPU to be marked
+ * preempted if and only if the VM-Exit was due to a host interrupt.
+ */
+ if (!vcpu->arch.at_instruction_boundary) {
+ vcpu->stat.preemption_other++;
+ return;
+ }
+
+ vcpu->stat.preemption_reported++;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
if (vcpu->arch.st.preempted)
return;
- if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
- &vcpu->arch.st.cache, true))
+ /* This happens on process exit */
+ if (unlikely(current->mm != vcpu->kvm->mm))
return;
- st = map.hva +
- offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+ slots = kvm_memslots(vcpu->kvm);
- st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+ if (unlikely(slots->generation != ghc->generation ||
+ gpa != ghc->gpa ||
+ kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+ return;
- kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+ st = (struct kvm_steal_time __user *)ghc->hva;
+ BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+ if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+ vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
int idx;
- if (vcpu->preempted)
+ if (vcpu->preempted) {
vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
- /*
- * Disable page faults because we're in atomic context here.
- * kvm_write_guest_offset_cached() would call might_fault()
- * that relies on pagefault_disable() to tell if there's a
- * bug. NOTE: the write to guest memory may not go through if
- * during postcopy live migration or if there's heavy guest
- * paging.
- */
- pagefault_disable();
- /*
- * kvm_memslots() will be called by
- * kvm_write_guest_offset_cached() so take the srcu lock.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_steal_time_set_preempted(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- pagefault_enable();
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ }
+
kvm_x86_ops.vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
@@ -4455,12 +4523,11 @@
{
unsigned long val;
+ memset(dbgregs, 0, sizeof(*dbgregs));
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
- dbgregs->flags = 0;
- memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
}
static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -7535,7 +7602,9 @@
write_fault_to_spt,
emulation_type))
return 1;
- if (ctxt->have_exception) {
+
+ if (ctxt->have_exception &&
+ !(emulation_type & EMULTYPE_SKIP)) {
/*
* #UD should result in just EMULATION_FAILED, and trap-like
* exception should not be encountered during decode.
@@ -8095,14 +8164,6 @@
r = -EOPNOTSUPP;
goto out;
}
-
-#ifdef CONFIG_PREEMPT_RT
- if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
- pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
- r = -EOPNOTSUPP;
- goto out;
- }
-#endif
r = -ENOMEM;
x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
@@ -9364,6 +9425,13 @@
vcpu->arch.l1tf_flush_l1d = true;
for (;;) {
+ /*
+ * If another guest vCPU requests a PV TLB flush in the middle
+ * of instruction emulation, the rest of the emulation could
+ * use a stale page translation. Assume that any code after
+ * this point can start executing an instruction.
+ */
+ vcpu->arch.at_instruction_boundary = false;
if (kvm_vcpu_running(vcpu)) {
r = vcpu_enter_guest(vcpu);
} else {
@@ -10249,10 +10317,7 @@
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
int idx;
-
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
kvmclock_reset(vcpu);
--
Gitblit v1.6.2