2024-01-03 2f7c68cb55ecb7331f2381deb497c27155f32faf
kernel/arch/x86/kvm/vmx/vmx.c
@@ -135,8 +135,7 @@
 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON				\
-	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
-	 X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
+	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 
 #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -1431,8 +1430,10 @@
 
 	/*
 	 * No indirect branch prediction barrier needed when switching
-	 * the active VMCS within a guest, e.g. on nested VM-Enter.
-	 * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+	 * the active VMCS within a vCPU, unless IBRS is advertised to
+	 * the vCPU. To minimize the number of IBPBs executed, KVM
+	 * performs IBPB on nested VM-Exit (a single nested transition
+	 * may switch the active VMCS multiple times).
	 */
 	if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
 		indirect_branch_prediction_barrier();
@@ -1518,6 +1519,11 @@
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long old_rflags;
 
+	/*
+	 * Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
+	 * is an unrestricted guest in order to mark L2 as needing emulation
+	 * if L1 runs L2 as a restricted guest.
+	 */
 	if (is_unrestricted_guest(vcpu)) {
 		kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
 		vmx->rflags = rflags;
@@ -2723,15 +2729,6 @@
 		if (!loaded_vmcs->msr_bitmap)
 			goto out_vmcs;
 		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
-
-		if (IS_ENABLED(CONFIG_HYPERV) &&
-		    static_branch_unlikely(&enable_evmcs) &&
-		    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
-			struct hv_enlightened_vmcs *evmcs =
-				(struct hv_enlightened_vmcs *)loaded_vmcs->vmcs;
-
-			evmcs->hv_enlightenments_control.msr_bitmap = 1;
-		}
 	}
 
 	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
@@ -3071,42 +3068,22 @@
 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
 }
 
-static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
-					unsigned long cr0,
-					struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
-		vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
-	if (!(cr0 & X86_CR0_PG)) {
-		/* From paging/starting to nonpaging */
-		exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					  CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	} else if (!is_paging(vcpu)) {
-		/* From nonpaging to paging */
-		exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
-					   CPU_BASED_CR3_STORE_EXITING);
-		vcpu->arch.cr0 = cr0;
-		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
-	}
-
-	if (!(cr0 & X86_CR0_WP))
-		*hw_cr0 &= ~X86_CR0_WP;
-}
+#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
+			  CPU_BASED_CR3_STORE_EXITING)
 
 void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long hw_cr0;
+	u32 tmp;
 
 	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
-	if (is_unrestricted_guest(vcpu))
+	if (enable_unrestricted_guest)
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
 	else {
 		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
+		if (!enable_ept)
+			hw_cr0 |= X86_CR0_WP;
 
 		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
 			enter_pmode(vcpu);
@@ -3124,8 +3101,47 @@
 	}
 #endif
 
-	if (enable_ept && !is_unrestricted_guest(vcpu))
-		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+	if (enable_ept && !enable_unrestricted_guest) {
+		/*
+		 * Ensure KVM has an up-to-date snapshot of the guest's CR3. If
+		 * the below code _enables_ CR3 exiting, vmx_cache_reg() will
+		 * (correctly) stop reading vmcs.GUEST_CR3 because it thinks
+		 * KVM's CR3 is installed.
+		 */
+		if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+			vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
+
+		/*
+		 * When running with EPT but not unrestricted guest, KVM must
+		 * intercept CR3 accesses when paging is _disabled_. This is
+		 * necessary because restricted guests can't actually run with
+		 * paging disabled, and so KVM stuffs its own CR3 in order to
+		 * run the guest when identity mapped page tables.
+		 *
+		 * Do _NOT_ check the old CR0.PG, e.g. to optimize away the
+		 * update, it may be stale with respect to CR3 interception,
+		 * e.g. after nested VM-Enter.
+		 *
+		 * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
+		 * stores to forward them to L1, even if KVM does not need to
+		 * intercept them to preserve its identity mapped page tables.
+		 */
+		if (!(cr0 & X86_CR0_PG)) {
+			exec_controls_setbit(vmx, CR3_EXITING_BITS);
+		} else if (!is_guest_mode(vcpu)) {
+			exec_controls_clearbit(vmx, CR3_EXITING_BITS);
+		} else {
+			tmp = exec_controls_get(vmx);
+			tmp &= ~CR3_EXITING_BITS;
+			tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
+			exec_controls_set(vmx, tmp);
+		}
+
+		if (!is_paging(vcpu) != !(cr0 & X86_CR0_PG)) {
+			vcpu->arch.cr0 = cr0;
+			vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
+		}
+	}
 
 	vmcs_writel(CR0_READ_SHADOW, cr0);
 	vmcs_writel(GUEST_CR0, hw_cr0);
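
When L2 is active, the new code merges KVM's own CR3-exiting needs with whatever L1 requested in vmcs12 instead of blindly clearing the intercepts. A minimal userspace sketch of that merge follows; the constants mirror the SDM bit positions for the CR3-load/store exiting controls, and the helper name is illustrative rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Primary processor-based VM-execution controls, per the Intel SDM:
 * bit 15 = CR3-load exiting, bit 16 = CR3-store exiting. */
#define CR3_LOAD_EXITING	(1u << 15)
#define CR3_STORE_EXITING	(1u << 16)
#define CR3_EXITING		(CR3_LOAD_EXITING | CR3_STORE_EXITING)

/*
 * While L2 is active, drop KVM's own CR3 intercepts and keep only the
 * bits L1 requested in its vmcs12 controls (illustrative helper, not
 * the kernel function).
 */
static uint32_t merge_cr3_exiting(uint32_t cur_controls, uint32_t vmcs12_controls)
{
	cur_controls &= ~CR3_EXITING;
	cur_controls |= vmcs12_controls & CR3_EXITING;
	return cur_controls;
}

int main(void)
{
	uint32_t cur = CR3_EXITING;		/* KVM currently intercepts both */
	uint32_t l1 = CR3_STORE_EXITING;	/* L1 only asked for CR3 stores */

	printf("merged controls: %#x\n", merge_cr3_exiting(cur, l1));
	return 0;
}

In the kernel the same read-modify-write goes through exec_controls_get()/exec_controls_set(), which also keeps the cached copy of the execution controls in sync with the VMCS field.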
@@ -3220,7 +3236,7 @@
 	unsigned long hw_cr4;
 
 	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
-	if (is_unrestricted_guest(vcpu))
+	if (enable_unrestricted_guest)
 		hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
 	else if (vmx->rmode.vm86_active)
 		hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3240,7 +3256,7 @@
 	vcpu->arch.cr4 = cr4;
 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
 
-	if (!is_unrestricted_guest(vcpu)) {
+	if (!enable_unrestricted_guest) {
 		if (enable_ept) {
 			if (!is_paging(vcpu)) {
 				hw_cr4 &= ~X86_CR4_PAE;
@@ -3332,18 +3348,15 @@
 {
 	u32 ar;
 
-	if (var->unusable || !var->present)
-		ar = 1 << 16;
-	else {
-		ar = var->type & 15;
-		ar |= (var->s & 1) << 4;
-		ar |= (var->dpl & 3) << 5;
-		ar |= (var->present & 1) << 7;
-		ar |= (var->avl & 1) << 12;
-		ar |= (var->l & 1) << 13;
-		ar |= (var->db & 1) << 14;
-		ar |= (var->g & 1) << 15;
-	}
+	ar = var->type & 15;
+	ar |= (var->s & 1) << 4;
+	ar |= (var->dpl & 3) << 5;
+	ar |= (var->present & 1) << 7;
+	ar |= (var->avl & 1) << 12;
+	ar |= (var->l & 1) << 13;
+	ar |= (var->db & 1) << 14;
+	ar |= (var->g & 1) << 15;
+	ar |= (var->unusable || !var->present) << 16;
 
 	return ar;
 }
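
The rewritten vmx_segment_access_rights() packs every attribute field unconditionally and folds the VMX "unusable" flag (bit 16) in at the end, which is what makes the old if/else unnecessary. A standalone sketch of the same packing, with an illustrative stand-in for struct kvm_segment:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the struct kvm_segment fields used here. */
struct seg {
	uint8_t type, s, dpl, present, avl, l, db, g;
	bool unusable;
};

/* Pack segment attributes into the VMCS access-rights layout; bit 16 is
 * the "segment unusable" flag. */
static uint32_t seg_access_rights(const struct seg *var)
{
	uint32_t ar;

	ar = var->type & 15;
	ar |= (var->s & 1) << 4;
	ar |= (var->dpl & 3) << 5;
	ar |= (var->present & 1) << 7;
	ar |= (var->avl & 1) << 12;
	ar |= (var->l & 1) << 13;
	ar |= (var->db & 1) << 14;
	ar |= (var->g & 1) << 15;
	ar |= (var->unusable || !var->present) << 16;

	return ar;
}

int main(void)
{
	struct seg cs = { .type = 11, .s = 1, .present = 1, .db = 1, .g = 1 };
	struct seg unusable = { .unusable = true };

	printf("cs ar = %#x, unusable ar = %#x\n",
	       seg_access_rights(&cs), seg_access_rights(&unusable));
	return 0;
}

Note that an unusable segment now keeps its other attribute bits instead of being forced to a bare 1 << 16; the rewrite relies on the unusable flag taking precedence over those bits.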
@@ -3795,6 +3808,22 @@
 		__set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
 }
 
+static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
+{
+	/*
+	 * When KVM is a nested hypervisor on top of Hyper-V and uses
+	 * 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
+	 * bitmap has changed.
+	 */
+	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+		if (evmcs->hv_enlightenments_control.msr_bitmap)
+			evmcs->hv_clean_fields &=
+				~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+	}
+}
+
 static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
 							  u32 msr, int type)
 {
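
vmx_msr_bitmap_l01_changed() exists to tell L0 (Hyper-V) that the L1 MSR bitmap must be re-scanned, which in eVMCS terms means clearing the bitmap's bit in the clean-fields mask. A tiny sketch of that dirty-marking pattern, with an assumed bit position (the real encoding comes from the Hyper-V TLFS definitions):

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit position; the real HV_VMX_ENLIGHTENED_CLEAN_FIELD_*
 * values live in the Hyper-V TLFS headers. */
#define CLEAN_FIELD_MSR_BITMAP	(1u << 1)

/* Mark the MSR bitmap dirty: clearing its clean bit tells the L0
 * hypervisor that this field group must be re-read on the next entry. */
static void mark_msr_bitmap_dirty(uint32_t *clean_fields)
{
	*clean_fields &= ~CLEAN_FIELD_MSR_BITMAP;
}

int main(void)
{
	uint32_t clean_fields = ~0u;	/* start with everything marked clean */

	mark_msr_bitmap_dirty(&clean_fields);
	printf("clean fields: %#x\n", clean_fields);
	return 0;
}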
@@ -3804,8 +3833,7 @@
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
 
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
+	vmx_msr_bitmap_l01_changed(vmx);
 
 	/*
 	 * Mark the desired intercept state in shadow bitmap, this is needed
@@ -3850,8 +3878,7 @@
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
 
-	if (static_branch_unlikely(&enable_evmcs))
-		evmcs_touch_msr_bitmap();
+	vmx_msr_bitmap_l01_changed(vmx);
 
 	/*
 	 * Mark the desired intercept state in shadow bitmap, this is needed
@@ -6506,6 +6533,7 @@
 		return;
 
 	handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
+	vcpu->arch.at_instruction_boundary = true;
 }
 
 static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
@@ -7030,6 +7058,19 @@
 	if (err < 0)
 		goto free_pml;
 
+	/*
+	 * Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
+	 * nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
+	 * feature only for vmcs01, KVM currently isn't equipped to realize any
+	 * performance benefits from enabling it for vmcs02.
+	 */
+	if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs) &&
+	    (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
+		struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+		evmcs->hv_enlightenments_control.msr_bitmap = 1;
+	}
+
 	/* The MSR bitmap starts with all ones */
 	bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
 	bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
@@ -7519,6 +7560,21 @@
 		/* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
 		break;
 
+	case x86_intercept_pause:
+		/*
+		 * PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
+		 * with vanilla NOPs in the emulator. Apply the interception
+		 * check only to actual PAUSE instructions. Don't check
+		 * PAUSE-loop-exiting, software can't expect a given PAUSE to
+		 * exit, i.e. KVM is within its rights to allow L2 to execute
+		 * the PAUSE.
+		 */
+		if ((info->rep_prefix != REPE_PREFIX) ||
+		    !nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
+			return X86EMUL_CONTINUE;
+
+		break;
+
 	/* TODO: check more intercepts... */
 	default:
 		break;
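
The rep_prefix check is what distinguishes a real PAUSE (F3 90) from a plain one-byte NOP (90); the emulator decodes both as opcode 0x90, so only the REPE-prefixed form is subject to L1's PAUSE-exiting intercept. A minimal sketch of that decision, assuming the emulator's 0xf3 value for REPE_PREFIX:

#include <stdbool.h>
#include <stdio.h>

#define REPE_PREFIX	0xf3	/* assumed value of the REPE/F3 prefix */

/*
 * Returns true only when the decoded instruction is a real PAUSE (REPE
 * prefix present) and L1 asked for PAUSE exiting. PAUSE-loop exiting is
 * deliberately ignored, since no particular PAUSE is guaranteed to exit.
 */
static bool pause_intercept_applies(unsigned int rep_prefix, bool l1_pause_exiting)
{
	if (rep_prefix != REPE_PREFIX)
		return false;

	return l1_pause_exiting;
}

int main(void)
{
	printf("plain NOP intercepted: %d\n", pause_intercept_applies(0, true));
	printf("PAUSE intercepted:     %d\n",
	       pause_intercept_applies(REPE_PREFIX, true));
	return 0;
}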