| .. | .. |
| 231 | 231 | VCPU_STAT("l1d_flush", l1d_flush), |
| 232 | 232 | VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), |
| 233 | 233 | VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), |
| 234 | + VCPU_STAT("preemption_reported", preemption_reported), |
| 235 | + VCPU_STAT("preemption_other", preemption_other), |
| 234 | 236 | VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), |
| 235 | 237 | VM_STAT("mmu_pte_write", mmu_pte_write), |
| 236 | 238 | VM_STAT("mmu_pde_zapped", mmu_pde_zapped), |
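
The two VCPU_STAT() entries added above only wire counters into the existing stats table; the counters themselves are new u64 fields in struct kvm_vcpu_stat (arch/x86/include/asm/kvm_host.h). A reduced, illustrative sketch of those fields, names taken from the hunk, struct name changed so it does not read as the full definition:

```c
#include <linux/types.h>

/* Sketch only: the per-vCPU counters behind the new VCPU_STAT() entries. */
struct kvm_vcpu_stat_sketch {
	__u64 preemption_reported;	/* preemptions the guest was told about */
	__u64 preemption_other;		/* preemptions kept invisible to the guest */
};
```
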
| .. | .. |
| 1387 | 1389 | ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ |
| 1388 | 1390 | ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ |
| 1389 | 1391 | ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ |
| 1390 | | - ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) |
| 1392 | + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO) |
| 1391 | 1393 | |
| 1392 | 1394 | static u64 kvm_get_arch_capabilities(void) |
| 1393 | 1395 | { |
| .. | .. |
| 1443 | 1445 | * using VERW to clear CPU buffers. |
| 1444 | 1446 | */ |
| 1445 | 1447 | } |
| 1448 | + |
| 1449 | + if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) |
| 1450 | + data |= ARCH_CAP_GDS_NO; |
| 1446 | 1451 | |
| 1447 | 1452 | return data; |
| 1448 | 1453 | } |
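
For context on what reporting ARCH_CAP_GDS_NO buys the guest, here is a hedged sketch of the consumer side: a guest kernel reads the emulated MSR_IA32_ARCH_CAPABILITIES and can skip its Gather Data Sampling mitigation when the bit is set. The helper name is illustrative; the real decision lives in arch/x86/kernel/cpu/bugs.c and also consults the X86_BUG_GDS blacklist.

```c
#include <asm/cpufeature.h>
#include <asm/msr.h>

/* Illustrative guest-side check, not the actual bugs.c code path. */
static bool guest_wants_gds_mitigation(void)
{
	u64 caps = 0;

	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, caps);

	/* GDS_NO set means the host CPU/microcode already handles GDS. */
	return !(caps & ARCH_CAP_GDS_NO);
}
```
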
| .. | .. |
| 1586 | 1591 | allowed = !!test_bit(index - start, bitmap); |
| 1587 | 1592 | break; |
| 1588 | 1593 | } |
| 1594 | + |
| 1595 | + /* Note, VM-Exits that go down the "slow" path are accounted below. */ |
| 1596 | + ++vcpu->stat.exits; |
| 1589 | 1597 | } |
| 1590 | 1598 | |
| 1591 | 1599 | out: |
| .. | .. |
| 3020 | 3028 | |
| 3021 | 3029 | static void record_steal_time(struct kvm_vcpu *vcpu) |
| 3022 | 3030 | { |
| 3023 | | - struct kvm_host_map map; |
| 3024 | | - struct kvm_steal_time *st; |
| 3031 | + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; |
| 3032 | + struct kvm_steal_time __user *st; |
| 3033 | + struct kvm_memslots *slots; |
| 3034 | + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; |
| 3035 | + u64 steal; |
| 3036 | + u32 version; |
| 3025 | 3037 | |
| 3026 | 3038 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) |
| 3027 | 3039 | return; |
| 3028 | 3040 | |
| 3029 | | - /* -EAGAIN is returned in atomic context so we can just return. */ |
| 3030 | | - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, |
| 3031 | | - &map, &vcpu->arch.st.cache, false)) |
| 3041 | + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) |
| 3032 | 3042 | return; |
| 3033 | 3043 | |
| 3034 | | - st = map.hva + |
| 3035 | | - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); |
| 3044 | + slots = kvm_memslots(vcpu->kvm); |
| 3036 | 3045 | |
| 3046 | + if (unlikely(slots->generation != ghc->generation || |
| 3047 | + gpa != ghc->gpa || |
| 3048 | + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { |
| 3049 | + /* We rely on the fact that it fits in a single page. */ |
| 3050 | + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); |
| 3051 | + |
| 3052 | + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) || |
| 3053 | + kvm_is_error_hva(ghc->hva) || !ghc->memslot) |
| 3054 | + return; |
| 3055 | + } |
| 3056 | + |
| 3057 | + st = (struct kvm_steal_time __user *)ghc->hva; |
| 3037 | 3058 | /* |
| 3038 | 3059 | * Doing a TLB flush here, on the guest's behalf, can avoid |
| 3039 | 3060 | * expensive IPIs. |
| 3040 | 3061 | */ |
| 3041 | 3062 | if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { |
| 3063 | + u8 st_preempted = 0; |
| 3064 | + int err = -EFAULT; |
| 3065 | + |
| 3066 | + if (!user_access_begin(st, sizeof(*st))) |
| 3067 | + return; |
| 3068 | + |
| 3069 | + asm volatile("1: xchgb %0, %2\n" |
| 3070 | + "xor %1, %1\n" |
| 3071 | + "2:\n" |
| 3072 | + _ASM_EXTABLE_UA(1b, 2b) |
| 3073 | + : "+q" (st_preempted), |
| 3074 | + "+&r" (err), |
| 3075 | + "+m" (st->preempted)); |
| 3076 | + if (err) |
| 3077 | + goto out; |
| 3078 | + |
| 3079 | + user_access_end(); |
| 3080 | + |
| 3081 | + vcpu->arch.st.preempted = 0; |
| 3082 | + |
| 3042 | 3083 | trace_kvm_pv_tlb_flush(vcpu->vcpu_id, |
| 3043 | | - st->preempted & KVM_VCPU_FLUSH_TLB); |
| 3044 | | - if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) |
| 3084 | + st_preempted & KVM_VCPU_FLUSH_TLB); |
| 3085 | + if (st_preempted & KVM_VCPU_FLUSH_TLB) |
| 3045 | 3086 | kvm_vcpu_flush_tlb_guest(vcpu); |
| 3087 | + |
| 3088 | + if (!user_access_begin(st, sizeof(*st))) |
| 3089 | + goto dirty; |
| 3046 | 3090 | } else { |
| 3047 | | - st->preempted = 0; |
| 3091 | + if (!user_access_begin(st, sizeof(*st))) |
| 3092 | + return; |
| 3093 | + |
| 3094 | + unsafe_put_user(0, &st->preempted, out); |
| 3095 | + vcpu->arch.st.preempted = 0; |
| 3048 | 3096 | } |
| 3049 | 3097 | |
| 3050 | | - vcpu->arch.st.preempted = 0; |
| 3098 | + unsafe_get_user(version, &st->version, out); |
| 3099 | + if (version & 1) |
| 3100 | + version += 1; /* first time write, random junk */ |
| 3051 | 3101 | |
| 3052 | | - if (st->version & 1) |
| 3053 | | - st->version += 1; /* first time write, random junk */ |
| 3054 | | - |
| 3055 | | - st->version += 1; |
| 3102 | + version += 1; |
| 3103 | + unsafe_put_user(version, &st->version, out); |
| 3056 | 3104 | |
| 3057 | 3105 | smp_wmb(); |
| 3058 | 3106 | |
| 3059 | | - st->steal += current->sched_info.run_delay - |
| 3107 | + unsafe_get_user(steal, &st->steal, out); |
| 3108 | + steal += current->sched_info.run_delay - |
| 3060 | 3109 | vcpu->arch.st.last_steal; |
| 3061 | 3110 | vcpu->arch.st.last_steal = current->sched_info.run_delay; |
| 3111 | + unsafe_put_user(steal, &st->steal, out); |
| 3062 | 3112 | |
| 3063 | | - smp_wmb(); |
| 3113 | + version += 1; |
| 3114 | + unsafe_put_user(version, &st->version, out); |
| 3064 | 3115 | |
| 3065 | | - st->version += 1; |
| 3066 | | - |
| 3067 | | - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); |
| 3116 | + out: |
| 3117 | + user_access_end(); |
| 3118 | + dirty: |
| 3119 | + mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa)); |
| 3068 | 3120 | } |
| 3069 | 3121 | |
| 3070 | 3122 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
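
The version handshake above (value left odd while an update is in flight, smp_wmb() between the bumps) lets the guest read steal time without locking. A hedged sketch of the guest-side reader, modeled on kvm_steal_clock() in arch/x86/kernel/kvm.c; the per-CPU variable stands in for the guest's kvm_steal_time area registered via MSR_KVM_STEAL_TIME:

```c
#include <linux/percpu.h>
#include <asm/barrier.h>
#include <asm/kvm_para.h>

static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);

/* Sketch of the guest consumer; mirrors the seqcount-like protocol above. */
static u64 guest_steal_clock(int cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
	u64 steal;
	u32 version;

	do {
		version = src->version;		/* odd => host update in progress */
		virt_rmb();
		steal = src->steal;
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}
```
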
| .. | .. |
| 4049 | 4101 | |
| 4050 | 4102 | static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) |
| 4051 | 4103 | { |
| 4052 | | - struct kvm_host_map map; |
| 4053 | | - struct kvm_steal_time *st; |
| 4104 | + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; |
| 4105 | + struct kvm_steal_time __user *st; |
| 4106 | + struct kvm_memslots *slots; |
| 4107 | + static const u8 preempted = KVM_VCPU_PREEMPTED; |
| 4108 | + gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; |
| 4054 | 4109 | |
| 4110 | + /* |
| 4111 | + * The vCPU can be marked preempted if and only if the VM-Exit was on |
| 4112 | + * an instruction boundary and will not trigger guest emulation of any |
| 4113 | + * kind (see vcpu_run). Vendor specific code controls (conservatively) |
| 4114 | + * when this is true, for example allowing the vCPU to be marked |
| 4115 | + * preempted if and only if the VM-Exit was due to a host interrupt. |
| 4116 | + */ |
| 4117 | + if (!vcpu->arch.at_instruction_boundary) { |
| 4118 | + vcpu->stat.preemption_other++; |
| 4119 | + return; |
| 4120 | + } |
| 4121 | + |
| 4122 | + vcpu->stat.preemption_reported++; |
| 4055 | 4123 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) |
| 4056 | 4124 | return; |
| 4057 | 4125 | |
| 4058 | 4126 | if (vcpu->arch.st.preempted) |
| 4059 | 4127 | return; |
| 4060 | 4128 | |
| 4061 | | - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, |
| 4062 | | - &vcpu->arch.st.cache, true)) |
| 4129 | + /* This happens on process exit */ |
| 4130 | + if (unlikely(current->mm != vcpu->kvm->mm)) |
| 4063 | 4131 | return; |
| 4064 | 4132 | |
| 4065 | | - st = map.hva + |
| 4066 | | - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); |
| 4133 | + slots = kvm_memslots(vcpu->kvm); |
| 4067 | 4134 | |
| 4068 | | - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; |
| 4135 | + if (unlikely(slots->generation != ghc->generation || |
| 4136 | + gpa != ghc->gpa || |
| 4137 | + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) |
| 4138 | + return; |
| 4069 | 4139 | |
| 4070 | | - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); |
| 4140 | + st = (struct kvm_steal_time __user *)ghc->hva; |
| 4141 | + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); |
| 4142 | + |
| 4143 | + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) |
| 4144 | + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; |
| 4145 | + |
| 4146 | + mark_page_dirty_in_slot(ghc->memslot, gpa_to_gfn(ghc->gpa)); |
| 4071 | 4147 | } |
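
On the guest side, the preempted byte written here feeds the paravirt vcpu_is_preempted() hook used by spinlocks and the scheduler. A hedged sketch of that consumer, following the __kvm_vcpu_is_preempted() pattern in arch/x86/kernel/kvm.c; again the per-CPU variable stands in for the area the guest registered via MSR_KVM_STEAL_TIME:

```c
#include <linux/compiler.h>
#include <linux/percpu.h>
#include <asm/kvm_para.h>

static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);

/* Sketch of the guest consumer of the preempted byte set above. */
static bool guest_vcpu_is_preempted(long cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

	return !!(READ_ONCE(src->preempted) & KVM_VCPU_PREEMPTED);
}
```
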
| 4072 | 4148 | |
| 4073 | 4149 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
| 4074 | 4150 | { |
| 4075 | 4151 | int idx; |
| 4076 | 4152 | |
| 4077 | | - if (vcpu->preempted) |
| 4153 | + if (vcpu->preempted) { |
| 4078 | 4154 | vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); |
| 4079 | 4155 | |
| 4080 | | - /* |
| 4081 | | - * Disable page faults because we're in atomic context here. |
| 4082 | | - * kvm_write_guest_offset_cached() would call might_fault() |
| 4083 | | - * that relies on pagefault_disable() to tell if there's a |
| 4084 | | - * bug. NOTE: the write to guest memory may not go through if |
| 4085 | | - * during postcopy live migration or if there's heavy guest |
| 4086 | | - * paging. |
| 4087 | | - */ |
| 4088 | | - pagefault_disable(); |
| 4089 | | - /* |
| 4090 | | - * kvm_memslots() will be called by |
| 4091 | | - * kvm_write_guest_offset_cached() so take the srcu lock. |
| 4092 | | - */ |
| 4093 | | - idx = srcu_read_lock(&vcpu->kvm->srcu); |
| 4094 | | - kvm_steal_time_set_preempted(vcpu); |
| 4095 | | - srcu_read_unlock(&vcpu->kvm->srcu, idx); |
| 4096 | | - pagefault_enable(); |
| 4156 | + /* |
| 4157 | + * Take the srcu lock as memslots will be accessed to check the gfn |
| 4158 | + * cache generation against the memslots generation. |
| 4159 | + */ |
| 4160 | + idx = srcu_read_lock(&vcpu->kvm->srcu); |
| 4161 | + kvm_steal_time_set_preempted(vcpu); |
| 4162 | + srcu_read_unlock(&vcpu->kvm->srcu, idx); |
| 4163 | + } |
| 4164 | + |
| 4097 | 4165 | kvm_x86_ops.vcpu_put(vcpu); |
| 4098 | 4166 | vcpu->arch.last_host_tsc = rdtsc(); |
| 4099 | 4167 | /* |
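
kvm_arch_vcpu_put() is reached from the scheduler's preempt notifier with preemption disabled, which is why kvm_steal_time_set_preempted() above writes with copy_to_user_nofault() instead of a faulting access. A simplified sketch of that call chain; the real kvm_sched_out() in virt/kvm/kvm_main.c also manages vcpu->ready and checks the task state before marking the vCPU preempted:

```c
/* Simplified: how a context switch ends up in kvm_arch_vcpu_put() above. */
static void kvm_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	struct kvm_vcpu *vcpu = container_of(pn, struct kvm_vcpu, preempt_notifier);

	WRITE_ONCE(vcpu->preempted, true);
	kvm_arch_vcpu_put(vcpu);	/* -> kvm_steal_time_set_preempted() */
}
```
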
| .. | .. |
| 4455 | 4523 | { |
| 4456 | 4524 | unsigned long val; |
| 4457 | 4525 | |
| 4526 | + memset(dbgregs, 0, sizeof(*dbgregs)); |
| 4458 | 4527 | memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); |
| 4459 | 4528 | kvm_get_dr(vcpu, 6, &val); |
| 4460 | 4529 | dbgregs->dr6 = val; |
| 4461 | 4530 | dbgregs->dr7 = vcpu->arch.dr7; |
| 4462 | | - dbgregs->flags = 0; |
| 4463 | | - memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); |
| 4464 | 4531 | } |
| 4465 | 4532 | |
| 4466 | 4533 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, |
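
Why the memset() of the whole structure is the right fix: KVM_GET_DEBUGREGS copies the entire uapi struct back to userspace, so every byte, including flags and the reserved tail, must be initialized rather than left as stack garbage. The layout in question, as defined in arch/x86/include/uapi/asm/kvm.h, is shown below for reference:

```c
#include <linux/types.h>

/* uapi layout returned by KVM_GET_DEBUGREGS; one memset() now clears it all. */
struct kvm_debugregs {
	__u64 db[4];
	__u64 dr6;
	__u64 dr7;
	__u64 flags;
	__u64 reserved[9];
};
```
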
| .. | .. |
| 7535 | 7602 | write_fault_to_spt, |
| 7536 | 7603 | emulation_type)) |
| 7537 | 7604 | return 1; |
| 7538 | | - if (ctxt->have_exception) { |
| 7605 | + |
| 7606 | + if (ctxt->have_exception && |
| 7607 | + !(emulation_type & EMULTYPE_SKIP)) { |
| 7539 | 7608 | /* |
| 7540 | 7609 | * #UD should result in just EMULATION_FAILED, and trap-like |
| 7541 | 7610 | * exception should not be encountered during decode. |
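
The added !(emulation_type & EMULTYPE_SKIP) guard matters because EMULTYPE_SKIP callers only want RIP advanced past the current instruction; an exception queued while decoding should make the skip fail rather than be injected. A hedged sketch of such a caller, loosely modeled on SVM's skip path when the hardware does not supply a next_rip:

```c
/* Illustrative EMULTYPE_SKIP user: advance RIP, never inject from decode. */
static int skip_current_instruction(struct kvm_vcpu *vcpu)
{
	return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP);
}
```
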
| .. | .. |
| 8095 | 8164 | r = -EOPNOTSUPP; |
| 8096 | 8165 | goto out; |
| 8097 | 8166 | } |
| 8098 | | - |
| 8099 | | -#ifdef CONFIG_PREEMPT_RT |
| 8100 | | - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
| 8101 | | - pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); |
| 8102 | | - r = -EOPNOTSUPP; |
| 8103 | | - goto out; |
| 8104 | | - } |
| 8105 | | -#endif |
| 8106 | 8167 | |
| 8107 | 8168 | r = -ENOMEM; |
| 8108 | 8169 | x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), |
| .. | .. |
| 9364 | 9425 | vcpu->arch.l1tf_flush_l1d = true; |
| 9365 | 9426 | |
| 9366 | 9427 | for (;;) { |
| 9428 | + /* |
| 9429 | + * If another guest vCPU requests a PV TLB flush in the middle |
| 9430 | + * of instruction emulation, the rest of the emulation could |
| 9431 | + * use a stale page translation. Assume that any code after |
| 9432 | + * this point can start executing an instruction. |
| 9433 | + */ |
| 9434 | + vcpu->arch.at_instruction_boundary = false; |
| 9367 | 9435 | if (kvm_vcpu_running(vcpu)) { |
| 9368 | 9436 | r = vcpu_enter_guest(vcpu); |
| 9369 | 9437 | } else { |
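
The flag cleared at the top of the loop above is only set back to true by vendor code, on the one VM-Exit type that is guaranteed to land on an instruction boundary: a host interrupt. A hedged sketch of the VMX side of that companion change, body trimmed to the relevant line:

```c
/*
 * VMX flavor, sketched: an exit taken to service a host IRQ cannot be in
 * the middle of emulating a guest instruction, so a later preemption may
 * safely be reported to the guest.
 */
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
	/* ... fetch the interrupt info and call the host IRQ handler ... */

	vcpu->arch.at_instruction_boundary = true;
}
```
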
| .. | .. |
| 10249 | 10317 | |
| 10250 | 10318 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
| 10251 | 10319 | { |
| 10252 | | - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; |
| 10253 | 10320 | int idx; |
| 10254 | | - |
| 10255 | | - kvm_release_pfn(cache->pfn, cache->dirty, cache); |
| 10256 | 10321 | |
| 10257 | 10322 | kvmclock_reset(vcpu); |
| 10258 | 10323 | |