2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/arch/x86/kvm/x86.c
@@ -231,6 +231,8 @@
         VCPU_STAT("l1d_flush", l1d_flush),
         VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
         VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+        VCPU_STAT("preemption_reported", preemption_reported),
+        VCPU_STAT("preemption_other", preemption_other),
         VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
         VM_STAT("mmu_pte_write", mmu_pte_write),
         VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -1387,7 +1389,7 @@
          ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
          ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
          ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
-         ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+         ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
 
 static u64 kvm_get_arch_capabilities(void)
 {
@@ -1443,6 +1445,9 @@
          * using VERW to clear CPU buffers.
          */
         }
+
+        if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+                data |= ARCH_CAP_GDS_NO;
 
         return data;
 }
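
The two hunks above extend the set of IA32_ARCH_CAPABILITIES bits KVM is willing to expose and then report GDS_NO whenever the host is either not affected by Gather Data Sampling or has the microcode mitigation loaded. As a rough illustration of the consumer side (not part of this patch; gds_guest_affected() is a hypothetical helper), a guest kernel could decide whether it needs its own GDS mitigation like this:

/*
 * Hedged sketch of a guest-side consumer: if the hypervisor reports
 * GDS_NO in IA32_ARCH_CAPABILITIES, the guest can treat itself as not
 * affected and skip the Gather Data Sampling mitigation.
 */
static bool gds_guest_affected(void)
{
        u64 caps = 0;

        if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
                rdmsrl(MSR_IA32_ARCH_CAPABILITIES, caps);

        /* GDS_NO set means the (virtual) CPU is not vulnerable. */
        return !(caps & ARCH_CAP_GDS_NO);
}
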
@@ -1586,6 +1591,9 @@
                         allowed = !!test_bit(index - start, bitmap);
                         break;
                 }
+
+                /* Note, VM-Exits that go down the "slow" path are accounted below. */
+                ++vcpu->stat.exits;
         }
 
 out:
@@ -3020,51 +3028,95 @@
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-        struct kvm_host_map map;
-        struct kvm_steal_time *st;
+        struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+        struct kvm_steal_time __user *st;
+        struct kvm_memslots *slots;
+        gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+        u64 steal;
+        u32 version;
 
         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                 return;
 
-        /* -EAGAIN is returned in atomic context so we can just return. */
-        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-                        &map, &vcpu->arch.st.cache, false))
+        if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
                 return;
 
-        st = map.hva +
-                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+        slots = kvm_memslots(vcpu->kvm);
 
+        if (unlikely(slots->generation != ghc->generation ||
+                     gpa != ghc->gpa ||
+                     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+                /* We rely on the fact that it fits in a single page. */
+                BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+                if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+                    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+                        return;
+        }
+
+        st = (struct kvm_steal_time __user *)ghc->hva;
         /*
          * Doing a TLB flush here, on the guest's behalf, can avoid
          * expensive IPIs.
          */
         if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+                u8 st_preempted = 0;
+                int err = -EFAULT;
+
+                if (!user_access_begin(st, sizeof(*st)))
+                        return;
+
+                asm volatile("1: xchgb %0, %2\n"
+                             "xor %1, %1\n"
+                             "2:\n"
+                             _ASM_EXTABLE_UA(1b, 2b)
+                             : "+q" (st_preempted),
+                               "+&r" (err),
+                               "+m" (st->preempted));
+                if (err)
+                        goto out;
+
+                user_access_end();
+
+                vcpu->arch.st.preempted = 0;
+
                 trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-                                       st->preempted & KVM_VCPU_FLUSH_TLB);
-                if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+                                       st_preempted & KVM_VCPU_FLUSH_TLB);
+                if (st_preempted & KVM_VCPU_FLUSH_TLB)
                         kvm_vcpu_flush_tlb_guest(vcpu);
+
+                if (!user_access_begin(st, sizeof(*st)))
+                        goto dirty;
         } else {
-                st->preempted = 0;
+                if (!user_access_begin(st, sizeof(*st)))
+                        return;
+
+                unsafe_put_user(0, &st->preempted, out);
+                vcpu->arch.st.preempted = 0;
         }
 
-        vcpu->arch.st.preempted = 0;
+        unsafe_get_user(version, &st->version, out);
+        if (version & 1)
+                version += 1;  /* first time write, random junk */
 
-        if (st->version & 1)
-                st->version += 1;  /* first time write, random junk */
-
-        st->version += 1;
+        version += 1;
+        unsafe_put_user(version, &st->version, out);
 
         smp_wmb();
 
-        st->steal += current->sched_info.run_delay -
+        unsafe_get_user(steal, &st->steal, out);
+        steal += current->sched_info.run_delay -
                 vcpu->arch.st.last_steal;
         vcpu->arch.st.last_steal = current->sched_info.run_delay;
+        unsafe_put_user(steal, &st->steal, out);
 
-        smp_wmb();
+        version += 1;
+        unsafe_put_user(version, &st->version, out);
 
-        st->version += 1;
-
-        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+out:
+        user_access_end();
+dirty:
+        mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
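
The rewritten record_steal_time() publishes steal time through the guest-visible kvm_steal_time area with a seqcount-style handshake: the version is bumped to an odd value before st->steal is updated and to an even value afterwards, with smp_wmb() ordering the stores. For reference, a guest-side reader of this ABI would loop roughly as in the sketch below (a simplified illustration of the documented protocol, not the actual code in arch/x86/kernel/kvm.c; read_steal_ns() is a made-up name):

/*
 * Hedged sketch of the guest-side counterpart to the version handshake
 * implemented above: retry while the version is odd (writer in progress)
 * or changes across the read of st->steal.
 */
static u64 read_steal_ns(struct kvm_steal_time *st)
{
        u32 version;
        u64 steal;

        do {
                version = READ_ONCE(st->version);
                virt_rmb();     /* read version before steal */
                steal = READ_ONCE(st->steal);
                virt_rmb();     /* read steal before re-checking version */
        } while ((version & 1) || version != READ_ONCE(st->version));

        return steal;
}
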
@@ -4049,51 +4101,67 @@
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-        struct kvm_host_map map;
-        struct kvm_steal_time *st;
+        struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+        struct kvm_steal_time __user *st;
+        struct kvm_memslots *slots;
+        static const u8 preempted = KVM_VCPU_PREEMPTED;
+        gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
 
+        /*
+         * The vCPU can be marked preempted if and only if the VM-Exit was on
+         * an instruction boundary and will not trigger guest emulation of any
+         * kind (see vcpu_run). Vendor specific code controls (conservatively)
+         * when this is true, for example allowing the vCPU to be marked
+         * preempted if and only if the VM-Exit was due to a host interrupt.
+         */
+        if (!vcpu->arch.at_instruction_boundary) {
+                vcpu->stat.preemption_other++;
+                return;
+        }
+
+        vcpu->stat.preemption_reported++;
         if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
                 return;
 
         if (vcpu->arch.st.preempted)
                 return;
 
-        if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-                        &vcpu->arch.st.cache, true))
+        /* This happens on process exit */
+        if (unlikely(current->mm != vcpu->kvm->mm))
                 return;
 
-        st = map.hva +
-                offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+        slots = kvm_memslots(vcpu->kvm);
 
-        st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+        if (unlikely(slots->generation != ghc->generation ||
+                     gpa != ghc->gpa ||
+                     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+                return;
 
-        kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+        st = (struct kvm_steal_time __user *)ghc->hva;
+        BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+        if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+                vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+        mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
         int idx;
 
-        if (vcpu->preempted)
+        if (vcpu->preempted) {
                 vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
 
-        /*
-         * Disable page faults because we're in atomic context here.
-         * kvm_write_guest_offset_cached() would call might_fault()
-         * that relies on pagefault_disable() to tell if there's a
-         * bug. NOTE: the write to guest memory may not go through if
-         * during postcopy live migration or if there's heavy guest
-         * paging.
-         */
-        pagefault_disable();
-        /*
-         * kvm_memslots() will be called by
-         * kvm_write_guest_offset_cached() so take the srcu lock.
-         */
-        idx = srcu_read_lock(&vcpu->kvm->srcu);
-        kvm_steal_time_set_preempted(vcpu);
-        srcu_read_unlock(&vcpu->kvm->srcu, idx);
-        pagefault_enable();
+                /*
+                 * Take the srcu lock as memslots will be accessed to check the gfn
+                 * cache generation against the memslots generation.
+                 */
+                idx = srcu_read_lock(&vcpu->kvm->srcu);
+                kvm_steal_time_set_preempted(vcpu);
+                srcu_read_unlock(&vcpu->kvm->srcu, idx);
+        }
+
         kvm_x86_ops.vcpu_put(vcpu);
         vcpu->arch.last_host_tsc = rdtsc();
         /*
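
kvm_steal_time_set_preempted() now publishes KVM_VCPU_PREEMPTED with a single non-faulting byte copy, and only when the exit happened on an instruction boundary. On the guest side this byte feeds vcpu_is_preempted(), which lets spinning vCPUs give up on lock holders the host has descheduled. A hedged sketch of that consumer (per-CPU plumbing omitted; steal_time_for() is a hypothetical accessor, the real code lives in arch/x86/kernel/kvm.c):

/*
 * Hedged sketch of the guest-side check that this flag feeds: a vCPU
 * spinning on a lock can stop spinning when the lock holder's
 * steal-time area says it was preempted by the host.
 */
static bool guest_vcpu_is_preempted(int cpu)
{
        struct kvm_steal_time *st = steal_time_for(cpu);   /* hypothetical accessor */

        return !!(READ_ONCE(st->preempted) & KVM_VCPU_PREEMPTED);
}
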
@@ -4455,12 +4523,11 @@
 {
         unsigned long val;
 
+        memset(dbgregs, 0, sizeof(*dbgregs));
         memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
         kvm_get_dr(vcpu, 6, &val);
         dbgregs->dr6 = val;
         dbgregs->dr7 = vcpu->arch.dr7;
-        dbgregs->flags = 0;
-        memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
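
The debugregs hunk above switches from zeroing individual fields after the fact to clearing the whole structure up front, so reserved fields and any implicit padding can never carry stale kernel stack bytes back to userspace through KVM_GET_DEBUGREGS. A minimal, stand-alone illustration of the same pattern (struct example_regs and fill_example_regs() are invented for this sketch):

/*
 * Hedged illustration of the "zero first, then fill" pattern: both the
 * reserved field and the 7 bytes of implicit padding after 'flags' are
 * guaranteed to reach userspace as zeroes.
 */
struct example_regs {
        u64 value;
        u8  flags;              /* 7 bytes of implicit padding follow */
        u64 reserved;
};

static void fill_example_regs(struct example_regs *regs, u64 value, u8 flags)
{
        memset(regs, 0, sizeof(*regs));
        regs->value = value;
        regs->flags = flags;
}
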
@@ -7535,7 +7602,9 @@
                                                   write_fault_to_spt,
                                                   emulation_type))
                                 return 1;
-                        if (ctxt->have_exception) {
+
+                        if (ctxt->have_exception &&
+                            !(emulation_type & EMULTYPE_SKIP)) {
                                 /*
                                  * #UD should result in just EMULATION_FAILED, and trap-like
                                  * exception should not be encountered during decode.
@@ -9356,6 +9425,13 @@
         vcpu->arch.l1tf_flush_l1d = true;
 
         for (;;) {
+                /*
+                 * If another guest vCPU requests a PV TLB flush in the middle
+                 * of instruction emulation, the rest of the emulation could
+                 * use a stale page translation. Assume that any code after
+                 * this point can start executing an instruction.
+                 */
+                vcpu->arch.at_instruction_boundary = false;
                 if (kvm_vcpu_running(vcpu)) {
                         r = vcpu_enter_guest(vcpu);
                 } else {
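
vcpu_run() clears at_instruction_boundary at the top of every loop iteration; it is up to vendor exit handling to set it back to true only for exits known to land on an instruction boundary, such as host interrupts. A hedged sketch of what such a vendor hook looks like (the function name is illustrative, not necessarily the exact VMX/SVM callback in this tree):

/*
 * Hedged sketch: a vendor IRQ-exit handler marking the exit as being on
 * an instruction boundary, so kvm_steal_time_set_preempted() is allowed
 * to report the vCPU as preempted.
 */
static void handle_external_interrupt_exit(struct kvm_vcpu *vcpu)
{
        /* ... dispatch the host interrupt here ... */

        /*
         * The exit was caused by a host interrupt and no emulation is in
         * flight, so it is safe to mark the instruction boundary.
         */
        vcpu->arch.at_instruction_boundary = true;
}
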
@@ -10241,10 +10317,7 @@
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-        struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
         int idx;
-
-        kvm_release_pfn(cache->pfn, cache->dirty, cache);
 
         kvmclock_reset(vcpu);
 