@@ -231,6 +231,8 @@
 	VCPU_STAT("l1d_flush", l1d_flush),
 	VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
 	VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+	VCPU_STAT("preemption_reported", preemption_reported),
+	VCPU_STAT("preemption_other", preemption_other),
 	VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
 	VM_STAT("mmu_pte_write", mmu_pte_write),
 	VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
@@ -1387,7 +1389,7 @@
 	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
 	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
 	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
-	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO)
 
 static u64 kvm_get_arch_capabilities(void)
 {
@@ -1443,6 +1445,9 @@
 		 * using VERW to clear CPU buffers.
 		 */
 	}
+
+	if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated())
+		data |= ARCH_CAP_GDS_NO;
 
 	return data;
 }
@@ -1586,6 +1591,9 @@
 			allowed = !!test_bit(index - start, bitmap);
 			break;
 		}
+
+		/* Note, VM-Exits that go down the "slow" path are accounted below. */
+		++vcpu->stat.exits;
 	}
 
 out:
@@ -3020,51 +3028,95 @@
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
+	u64 steal;
+	u32 version;
 
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
-	/* -EAGAIN is returned in atomic context so we can just return. */
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
-			&map, &vcpu->arch.st.cache, false))
+	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
 
+	if (unlikely(slots->generation != ghc->generation ||
+		     gpa != ghc->gpa ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
+		/* We rely on the fact that it fits in a single page. */
+		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
+		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
+			return;
+	}
+
+	st = (struct kvm_steal_time __user *)ghc->hva;
 	/*
 	 * Doing a TLB flush here, on the guest's behalf, can avoid
 	 * expensive IPIs.
 	 */
 	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+		u8 st_preempted = 0;
+		int err = -EFAULT;
+
+		if (!user_access_begin(st, sizeof(*st)))
+			return;
+
+		asm volatile("1: xchgb %0, %2\n"
+			     "xor %1, %1\n"
+			     "2:\n"
+			     _ASM_EXTABLE_UA(1b, 2b)
+			     : "+q" (st_preempted),
+			       "+&r" (err),
+			       "+m" (st->preempted));
+		if (err)
+			goto out;
+
+		user_access_end();
+
+		vcpu->arch.st.preempted = 0;
+
 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
-				       st->preempted & KVM_VCPU_FLUSH_TLB);
-		if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+				       st_preempted & KVM_VCPU_FLUSH_TLB);
+		if (st_preempted & KVM_VCPU_FLUSH_TLB)
 			kvm_vcpu_flush_tlb_guest(vcpu);
+
+		if (!user_access_begin(st, sizeof(*st)))
+			goto dirty;
 	} else {
-		st->preempted = 0;
+		if (!user_access_begin(st, sizeof(*st)))
+			return;
+
+		unsafe_put_user(0, &st->preempted, out);
+		vcpu->arch.st.preempted = 0;
 	}
 
-	vcpu->arch.st.preempted = 0;
+	unsafe_get_user(version, &st->version, out);
+	if (version & 1)
+		version += 1;  /* first time write, random junk */
 
-	if (st->version & 1)
-		st->version += 1;  /* first time write, random junk */
-
-	st->version += 1;
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
 	smp_wmb();
 
-	st->steal += current->sched_info.run_delay -
+	unsafe_get_user(steal, &st->steal, out);
+	steal += current->sched_info.run_delay -
 		vcpu->arch.st.last_steal;
 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	unsafe_put_user(steal, &st->steal, out);
 
-	smp_wmb();
+	version += 1;
+	unsafe_put_user(version, &st->version, out);
 
-	st->version += 1;
-
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
+ out:
+	user_access_end();
+ dirty:
+	mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
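Note: the version handling in the hunk above is the writer half of a seqcount-style protocol; the host publishes an odd version before updating the steal-time record and an even one afterwards. As an illustration of why both bumps matter, here is a sketch of the guest-side reader, shaped like the guest kernel's kvm_steal_clock(); the per-CPU variable and helper name below are assumptions for the example, not part of this patch.

#include <linux/percpu.h>
#include <asm/barrier.h>
#include <asm/kvm_para.h>	/* struct kvm_steal_time */

static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);

/* Retry until a stable, even version brackets the read of st->steal. */
static u64 example_read_steal_time(int cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
	u32 version;
	u64 steal;

	do {
		version = src->version;
		virt_rmb();		/* pairs with the host's smp_wmb() */
		steal = src->steal;
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}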
@@ -4049,51 +4101,67 @@
 
 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 {
-	struct kvm_host_map map;
-	struct kvm_steal_time *st;
+	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
+	struct kvm_steal_time __user *st;
+	struct kvm_memslots *slots;
+	static const u8 preempted = KVM_VCPU_PREEMPTED;
+	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
 
+	/*
+	 * The vCPU can be marked preempted if and only if the VM-Exit was on
+	 * an instruction boundary and will not trigger guest emulation of any
+	 * kind (see vcpu_run).  Vendor specific code controls (conservatively)
+	 * when this is true, for example allowing the vCPU to be marked
+	 * preempted if and only if the VM-Exit was due to a host interrupt.
+	 */
+	if (!vcpu->arch.at_instruction_boundary) {
+		vcpu->stat.preemption_other++;
+		return;
+	}
+
+	vcpu->stat.preemption_reported++;
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 
 	if (vcpu->arch.st.preempted)
 		return;
 
-	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
-			&vcpu->arch.st.cache, true))
+	/* This happens on process exit */
+	if (unlikely(current->mm != vcpu->kvm->mm))
 		return;
 
-	st = map.hva +
-		offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+	slots = kvm_memslots(vcpu->kvm);
 
-	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+	if (unlikely(slots->generation != ghc->generation ||
+		     gpa != ghc->gpa ||
+		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
+		return;
 
-	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
+	st = (struct kvm_steal_time __user *)ghc->hva;
+	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
+
+	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
+		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+	mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa));
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
-	if (vcpu->preempted)
+	if (vcpu->preempted) {
 		vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
 
-	/*
-	 * Disable page faults because we're in atomic context here.
-	 * kvm_write_guest_offset_cached() would call might_fault()
-	 * that relies on pagefault_disable() to tell if there's a
-	 * bug. NOTE: the write to guest memory may not go through if
-	 * during postcopy live migration or if there's heavy guest
-	 * paging.
-	 */
-	pagefault_disable();
-	/*
-	 * kvm_memslots() will be called by
-	 * kvm_write_guest_offset_cached() so take the srcu lock.
-	 */
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	kvm_steal_time_set_preempted(vcpu);
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
-	pagefault_enable();
+		/*
+		 * Take the srcu lock as memslots will be accessed to check the gfn
+		 * cache generation against the memslots generation.
+		 */
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		kvm_steal_time_set_preempted(vcpu);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	}
+
 	kvm_x86_ops.vcpu_put(vcpu);
 	vcpu->arch.last_host_tsc = rdtsc();
 	/*
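Note: kvm_steal_time_set_preempted() now runs from kvm_arch_vcpu_put(), which can be reached from the scheduler's preempt notifiers with preemption disabled; that is why the preempted byte is written with copy_to_user_nofault() instead of an access that could fault and sleep. For context, the consumer of that byte is the guest's paravirtualized preemption check; the sketch below mirrors the shape of __kvm_vcpu_is_preempted() in the guest kernel, with an illustrative helper name.

#include <linux/types.h>
#include <linux/compiler.h>
#include <asm/kvm_para.h>	/* struct kvm_steal_time, KVM_VCPU_PREEMPTED */

/* Guest-side view: has the host scheduled this vCPU out? */
static bool example_vcpu_is_preempted(struct kvm_steal_time *st)
{
	return !!(READ_ONCE(st->preempted) & KVM_VCPU_PREEMPTED);
}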
@@ -4455,11 +4523,10 @@
 {
 	unsigned long val;
 
+	memset(dbgregs, 0, sizeof(*dbgregs));
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	kvm_get_dr(vcpu, 6, &val);
 	dbgregs->dr6 = val;
 	dbgregs->dr7 = vcpu->arch.dr7;
-	dbgregs->flags = 0;
-	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -7535,7 +7602,9 @@
 						  write_fault_to_spt,
 						  emulation_type))
 				return 1;
-			if (ctxt->have_exception) {
+
+			if (ctxt->have_exception &&
+			    !(emulation_type & EMULTYPE_SKIP)) {
 				/*
 				 * #UD should result in just EMULATION_FAILED, and trap-like
 				 * exception should not be encountered during decode.
@@ -9356,6 +9425,13 @@
 	vcpu->arch.l1tf_flush_l1d = true;
 
 	for (;;) {
+		/*
+		 * If another guest vCPU requests a PV TLB flush in the middle
+		 * of instruction emulation, the rest of the emulation could
+		 * use a stale page translation. Assume that any code after
+		 * this point can start executing an instruction.
+		 */
+		vcpu->arch.at_instruction_boundary = false;
 		if (kvm_vcpu_running(vcpu)) {
 			r = vcpu_enter_guest(vcpu);
 		} else {
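Note: clearing at_instruction_boundary at the top of the run loop is the pessimistic half of the scheme described in kvm_steal_time_set_preempted() above; the optimistic half lives in vendor code (vmx/svm, not shown in this hunk), which sets the flag back to true only for exits known to land on an instruction boundary, such as a host external interrupt. A minimal sketch of that vendor-side hook, with a hypothetical function name:

/* Hypothetical vendor-side handler, for illustration only. */
static void example_host_interrupt_exit_irqoff(struct kvm_vcpu *vcpu)
{
	/* ... dispatch the host interrupt as the vendor module already does ... */

	/*
	 * An exit caused by a host interrupt always lands on an instruction
	 * boundary and never triggers emulation, so reporting this vCPU as
	 * preempted (KVM_VCPU_PREEMPTED) is safe for this exit.
	 */
	vcpu->arch.at_instruction_boundary = true;
}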
@@ -10241,10 +10317,7 @@
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
-	struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
 	int idx;
-
-	kvm_release_pfn(cache->pfn, cache->dirty, cache);
 
 	kvmclock_reset(vcpu);
 