@@ -135,8 +135,7 @@
 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON				\
-	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
-	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 
 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
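The functional change in this hunk is that CR0.WP is no longer unconditionally forced for non-unrestricted guests; as the vmx_set_cr0() hunk below shows, it is now forced only when EPT is disabled and KVM relies on shadow paging. A minimal sketch of the resulting hw_cr0 derivation; sketch_hw_cr0() is a hypothetical name, the real logic stays open-coded in vmx_set_cr0():

/* Hypothetical helper, mirroring the new open-coded logic. */
static unsigned long sketch_hw_cr0(unsigned long cr0, bool unrestricted,
				   bool ept)
{
	unsigned long hw_cr0 = cr0 & ~KVM_VM_CR0_ALWAYS_OFF;

	if (unrestricted) {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
	} else {
		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
		if (!ept)
			/* Shadow paging needs CR0.WP to write-protect guest PTEs. */
			hw_cr0 |= X86_CR0_WP;
	}
	return hw_cr0;
}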
@@ -1431,8 +1430,10 @@
 
 	/*
 	 * No indirect branch prediction barrier needed when switching
-	 * the active VMCS within a guest, e.g. on nested VM-Enter.
-	 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+	 * the active VMCS within a vCPU, unless IBRS is advertised to
+	 * the vCPU.  To minimize the number of IBPBs executed, KVM
+	 * performs IBPB on nested VM-Exit (a single nested transition
+	 * may switch the active VMCS multiple times).
 	 */
 	if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
 		indirect_branch_prediction_barrier();
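The rewritten comment encodes a policy worth spelling out: same-vCPU VMCS switches (e.g. vmcs01 to vmcs02) skip the barrier, and the IBRS-advertised case is instead handled by a single IBPB on nested VM-Exit. A sketch of the predicate, with sketch_needs_ibpb() as a hypothetical name for what the code above open-codes:

/* Hypothetical restatement of the open-coded check above. */
static bool sketch_needs_ibpb(struct vmcs *prev, struct loaded_vmcs *buddy)
{
	/*
	 * No buddy, or a buddy whose VMCS isn't the one that was just
	 * resident, means a genuine cross-vCPU switch: flush the branch
	 * predictor to protect the incoming vCPU.
	 */
	return !buddy || buddy->vmcs != prev;
}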
@@ -1518,6 +1519,11 @@
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long old_rflags;
 
+	/*
+	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+	 * is an unrestricted guest in order to mark L2 as needing emulation
+	 * if L1 runs L2 as a restricted guest.
+	 */
 	if (is_unrestricted_guest(vcpu)) {
 		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
 		vmx->rflags = rflags;
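Why the per-vCPU check matters here: for a restricted guest, a change to RFLAGS.VM flips the vCPU in or out of vm86-compatible real mode, which is what can force emulation. A hedged sketch of that trigger (simplified and hypothetical; the actual update lives in the function's else branch, outside this hunk):

/* Illustrative only: a VM-bit flip on a restricted guest may require emulation. */
static void sketch_rflags_vm_toggle(struct vcpu_vmx *vmx,
				    unsigned long old_rflags,
				    unsigned long new_rflags)
{
	if ((old_rflags ^ new_rflags) & X86_EFLAGS_VM)
		vmx->emulation_required = true;	/* hypothetical simplification */
}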
@@ -2723,15 +2729,6 @@
 		if (!loaded_vmcs->msr_bitmap)
 			goto out_vmcs;
 		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
-		if (IS_ENABLED(CONFIG_HYPERV) &&
-		    static_branch_unlikely(&enable_evmcs) &&
-		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
-			struct hv_enlightened_vmcs *evmcs =
-				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
-			evmcs->hv_enlightenments_control.msr_bitmap = 1;
-		}
 	}
 
 	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -3071,42 +3068,22 @@
 		kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 }
 
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
-				       unsigned long cr0,
-				       struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-		vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
-	if (!(cr0 & X86_CR0_PG)) {
-		/* From paging/starting to nonpaging */
-		exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					  CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	} else if (!is_paging(vcpu)) {
-		/* From nonpaging to paging */
-		exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					    CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	}
-
-	if (!(cr0 & X86_CR0_WP))
-		*hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+			  CPU_BASED_CR3_STORE_EXITING)
 
 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long hw_cr0;
+	u32 tmp;
 
 	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
-	if (is_unrestricted_guest(vcpu))
+	if (enable_unrestricted_guest)
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
 	else {
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+		if (!enable_ept)
+			hw_cr0 |= X86_CR0_WP;
 
 		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
 			enter_pmode(vcpu);
@@ -3124,8 +3101,47 @@
 	}
 #endif
 
-	if (enable_ept && !is_unrestricted_guest(vcpu))
-		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+	if (enable_ept && !enable_unrestricted_guest) {
+		/*
+		 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+		 * KVM's CR3 is installed.
+		 */
+		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+
+		/*
+		 * When running with EPT but not unrestricted guest, KVM must
+		 * intercept CR3 accesses when paging is _disabled_.  This is
+		 * necessary because restricted guests can't actually run with
+		 * paging disabled, and so KVM stuffs its own CR3 in order to
+		 * run the guest using identity mapped page tables.
+		 *
+		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+		 * update, as it may be stale with respect to CR3 interception,
+		 * e.g. after nested VM-Enter.
+		 *
+		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+		 * stores to forward them to L1, even if KVM does not need to
+		 * intercept them to preserve its identity mapped page tables.
+		 */
+		if (!(cr0 & X86_CR0_PG)) {
+			exec_controls_setbit(vmx, CR3_EXITING_BITS);
+		} else if (!is_guest_mode(vcpu)) {
+			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+		} else {
+			tmp = exec_controls_get(vmx);
+			tmp &= ~CR3_EXITING_BITS;
+			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+			exec_controls_set(vmx, tmp);
+		}
+
+		if (!is_paging(vcpu) != !(cr0 & X86_CR0_PG)) {
+			vcpu->arch.cr0 = cr0;
+			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+		}
+	}
 
 	vmcs_writel(CR0_READ_SHADOW, cr0);
 	vmcs_writel(GUEST_CR0, hw_cr0);
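The three-way update above compresses nicely into a decision table; a hedged sketch as a pure function (sketch_cr3_exiting_bits() is a hypothetical name, the kernel open-codes this):

/* Hypothetical helper: desired CR3 load/store exiting bits. */
static u32 sketch_cr3_exiting_bits(bool guest_paging_on, bool in_guest_mode,
				   u32 vmcs12_cpu_based_ctls)
{
	if (!guest_paging_on)
		return CR3_EXITING_BITS;	/* KVM's identity-map CR3 is live */
	if (!in_guest_mode)
		return 0;			/* the guest's CR3 can run natively */
	/* Running L2 with paging on: forward exactly what L1 requested. */
	return vmcs12_cpu_based_ctls & CR3_EXITING_BITS;
}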
@@ -3220,7 +3236,7 @@
 	unsigned long hw_cr4;
 
 	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
-	if (is_unrestricted_guest(vcpu))
+	if (enable_unrestricted_guest)
 		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
 	else if (vmx->rmode.vm86_active)
 		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3240,7 +3256,7 @@
 	vcpu->arch.cr4 = cr4;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
-	if (!is_unrestricted_guest(vcpu)) {
+	if (!enable_unrestricted_guest) {
 		if (enable_ept) {
 			if (!is_paging(vcpu)) {
 				hw_cr4 &= ~X86_CR4_PAE;
@@ -3332,18 +3348,15 @@
 {
 	u32 ar;
 
-	if (var->unusable || !var->present)
-		ar = 1 << 16;
-	else {
-		ar = var->type & 15;
-		ar |= (var->s & 1) << 4;
-		ar |= (var->dpl & 3) << 5;
-		ar |= (var->present & 1) << 7;
-		ar |= (var->avl & 1) << 12;
-		ar |= (var->l & 1) << 13;
-		ar |= (var->db & 1) << 14;
-		ar |= (var->g & 1) << 15;
-	}
+	ar = var->type & 15;
+	ar |= (var->s & 1) << 4;
+	ar |= (var->dpl & 3) << 5;
+	ar |= (var->present & 1) << 7;
+	ar |= (var->avl & 1) << 12;
+	ar |= (var->l & 1) << 13;
+	ar |= (var->db & 1) << 14;
+	ar |= (var->g & 1) << 15;
+	ar |= (var->unusable || !var->present) << 16;
 
 	return ar;
 }
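The branchless form packs the same VMX guest segment access-rights layout as before (Intel SDM, "Guest Segment Access Rights"), with one behavioral nuance: an unusable or not-present segment now keeps its other fields instead of collapsing to a bare 1 << 16. A sketch of the layout plus a hypothetical sanity check (sketch_check_ar() is not kernel code):

/*
 * VMX access-rights field layout packed above:
 *   bits 3:0  type     bit 12  AVL
 *   bit  4    S        bit 13  L (64-bit code segment)
 *   bits 6:5  DPL      bit 14  D/B
 *   bit  7    P        bit 15  G
 *   bit  16   unusable (also set when !present)
 */
static void sketch_check_ar(const struct kvm_segment *var, u32 ar)
{
	/* A not-present segment must always be marked unusable. */
	WARN_ON(!var->present && !(ar & (1u << 16)));
}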
@@ -3795,6 +3808,22 @@
 	__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
 }
 
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+	/*
+	 * When KVM is a nested hypervisor on top of Hyper-V and uses the
+	 * 'Enlightened MSR Bitmap' feature, L0 needs to know that the MSR
+	 * bitmap has changed.
+	 */
+	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+		if (evmcs->hv_enlightenments_control.msr_bitmap)
+			evmcs->hv_clean_fields &=
+				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+	}
+}
+
 static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
 							  u32 msr, int type)
 {
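The helper relies on the eVMCS clean-fields protocol: Hyper-V (L0) may skip re-reading any VMCS field group whose bit is set in hv_clean_fields, so KVM must clear the group's bit after modifying the corresponding state. A minimal sketch of that protocol; sketch_evmcs_dirty() is a hypothetical name:

/* Hypothetical helper: force L0 to re-read a field group on next VM-Entry. */
static void sketch_evmcs_dirty(struct hv_enlightened_vmcs *evmcs,
			       u32 clean_field_bit)
{
	evmcs->hv_clean_fields &= ~clean_field_bit;
}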
@@ -3804,8 +3833,7 @@
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
 
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
+	vmx_msr_bitmap_l01_changed(vmx);
 
 	/*
 	 * Mark the desired intercept state in shadow bitmap, this is needed
@@ -3850,8 +3878,7 @@
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
 
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
+	vmx_msr_bitmap_l01_changed(vmx);
 
 	/*
 	 * Mark the desired intercept state in shadow bitmap, this is needed
@@ -6506,6 +6533,7 @@
 		return;
 
 	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+	vcpu->arch.at_instruction_boundary = true;
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
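The new flag records that this external interrupt was taken at an instruction boundary, which is exactly when it is safe to tell the guest its vCPU was preempted. A hedged sketch of the consumer side (simplified and hypothetical; mark_guest_preempted() is not a real call site):

/* Illustrative only: report steal-time preemption just for clean boundaries. */
static void sketch_vcpu_put(struct kvm_vcpu *vcpu)
{
	if (vcpu->preempted && vcpu->arch.at_instruction_boundary)
		mark_guest_preempted(vcpu);	/* hypothetical call */
}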
@@ -7030,6 +7058,19 @@
 	if (err < 0)
 		goto free_pml;
 
+	/*
+	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+	 * feature only for vmcs01; KVM currently isn't equipped to realize
+	 * any performance benefits from enabling it for vmcs02.
+	 */
+	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+		evmcs->hv_enlightenments_control.msr_bitmap = 1;
+	}
+
 	/* The MSR bitmap starts with all ones */
 	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
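The eligibility test here is the same one removed from the loaded-VMCS allocation path earlier in this series, now applied only at vCPU creation for vmcs01. Factored as a predicate for clarity (hypothetical helper, the diff keeps it open-coded):

/* Hypothetical predicate for the open-coded condition above. */
static bool sketch_evmcs_msr_bitmap_supported(void)
{
	return IS_ENABLED(CONFIG_HYPERV) &&
	       static_branch_unlikely(&enable_evmcs) &&
	       (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP);
}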
@@ -7519,6 +7560,21 @@
 		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
 		break;
 
+	case x86_intercept_pause:
+		/*
+		 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+		 * with vanilla NOPs in the emulator.  Apply the interception
+		 * check only to actual PAUSE instructions.  Don't check
+		 * PAUSE-loop-exiting; software can't expect a given PAUSE to
+		 * exit, i.e. KVM is within its rights to allow L2 to execute
+		 * the PAUSE.
+		 */
+		if ((info->rep_prefix != REPE_PREFIX) ||
+		    !nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING))
+			return X86EMUL_CONTINUE;
+
+		break;
+
 	/* TODO: check more intercepts... */
 	default:
 		break;
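Note the check uses nested_cpu_has() rather than nested_cpu_has2(): CPU_BASED_PAUSE_EXITING is a primary processor-based execution control, and the secondary-controls helper would test the wrong field. The encoding collision the comment describes, concretely: PAUSE is the REPE (0xF3) prefix applied to the one-byte NOP (0x90), so the emulator decodes both through the same opcode path and only the prefixed form can be subject to PAUSE exiting:

/* Byte encodings behind the REPE check above. */
static const u8 sketch_nop[]   = { 0x90 };		/* NOP: never intercepted */
static const u8 sketch_pause[] = { 0xf3, 0x90 };	/* PAUSE: REPE prefix + NOP */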