
hc
2024-05-11 04dd17822334871b23ea2862f7798fb0e0007777
kernel/arch/powerpc/kvm/book3s_hv.c
....@@ -1,3 +1,4 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
34 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
....@@ -12,10 +13,6 @@
1213 *
1314 * This file is derived from arch/powerpc/kvm/book3s.c,
1415 * by Alexander Graf <agraf@suse.de>.
15
- *
16
- * This program is free software; you can redistribute it and/or modify
17
- * it under the terms of the GNU General Public License, version 2, as
18
- * published by the Free Software Foundation.
1916 */
2017
2118 #include <linux/kvm_host.h>
....@@ -50,6 +47,7 @@
5047 #include <asm/reg.h>
5148 #include <asm/ppc-opcode.h>
5249 #include <asm/asm-prototypes.h>
50
+#include <asm/archrandom.h>
5351 #include <asm/debug.h>
5452 #include <asm/disassemble.h>
5553 #include <asm/cputable.h>
....@@ -60,6 +58,7 @@
6058 #include <asm/kvm_book3s.h>
6159 #include <asm/mmu_context.h>
6260 #include <asm/lppaca.h>
61
+#include <asm/pmc.h>
6362 #include <asm/processor.h>
6463 #include <asm/cputhreads.h>
6564 #include <asm/page.h>
....@@ -73,6 +72,10 @@
7372 #include <asm/opal.h>
7473 #include <asm/xics.h>
7574 #include <asm/xive.h>
75
+#include <asm/hw_breakpoint.h>
76
+#include <asm/kvm_book3s_uvmem.h>
77
+#include <asm/ultravisor.h>
78
+#include <asm/dtl.h>
7679
7780 #include "book3s.h"
7881
....@@ -104,8 +107,12 @@
104107 module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
105108 MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
106109
110
+static bool one_vm_per_core;
111
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
112
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
113
+
107114 #ifdef CONFIG_KVM_XICS
108
-static struct kernel_param_ops module_param_ops = {
115
+static const struct kernel_param_ops module_param_ops = {
109116 .set = param_set_int,
110117 .get = param_get_int,
111118 };
....@@ -117,10 +124,19 @@
117124 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
118125 #endif
119126
127
+/* If set, guests are allowed to create and control nested guests */
128
+static bool nested = true;
129
+module_param(nested, bool, S_IRUGO | S_IWUSR);
130
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
131
+
132
+static inline bool nesting_enabled(struct kvm *kvm)
133
+{
134
+ return kvm->arch.nested_enable && kvm_is_radix(kvm);
135
+}
136
+
120137 /* If set, the threads on each CPU core have to be in the same MMU mode */
121138 static bool no_mixing_hpt_and_radix;
122139
123
-static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
124140 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
125141
126142 /*
....@@ -173,6 +189,10 @@
173189 {
174190 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
175191
192
+ /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
193
+ if (kvmhv_on_pseries())
194
+ return false;
195
+
176196 /* On POWER9 we can use msgsnd to IPI any cpu */
177197 if (cpu_has_feature(CPU_FTR_ARCH_300)) {
178198 msg |= get_hard_smp_processor_id(cpu);
....@@ -212,13 +232,11 @@
212232 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
213233 {
214234 int cpu;
215
- struct swait_queue_head *wqp;
235
+ struct rcuwait *waitp;
216236
217
- wqp = kvm_arch_vcpu_wq(vcpu);
218
- if (swq_has_sleeper(wqp)) {
219
- swake_up_one(wqp);
237
+ waitp = kvm_arch_vcpu_get_wait(vcpu);
238
+ if (rcuwait_wake_up(waitp))
220239 ++vcpu->stat.halt_wakeup;
221
- }
222240
223241 cpu = READ_ONCE(vcpu->arch.thread_cpu);
224242 if (cpu >= 0 && kvmppc_ipi_thread(cpu))
....@@ -321,25 +339,13 @@
321339 spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
322340 }
323341
324
-static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
325
-{
326
- /*
327
- * Check for illegal transactional state bit combination
328
- * and if we find it, force the TS field to a safe state.
329
- */
330
- if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
331
- msr &= ~MSR_TS_MASK;
332
- vcpu->arch.shregs.msr = msr;
333
- kvmppc_end_cede(vcpu);
334
-}
335
-
336342 static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
337343 {
338344 vcpu->arch.pvr = pvr;
339345 }
340346
341347 /* Dummy value used in computing PCR value below */
342
-#define PCR_ARCH_300 (PCR_ARCH_207 << 1)
348
+#define PCR_ARCH_31 (PCR_ARCH_300 << 1)
343349
344350 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
345351 {
....@@ -347,7 +353,9 @@
347353 struct kvmppc_vcore *vc = vcpu->arch.vcore;
348354
349355 /* We can (emulate) our own architecture version and anything older */
350
- if (cpu_has_feature(CPU_FTR_ARCH_300))
356
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
357
+ host_pcr_bit = PCR_ARCH_31;
358
+ else if (cpu_has_feature(CPU_FTR_ARCH_300))
351359 host_pcr_bit = PCR_ARCH_300;
352360 else if (cpu_has_feature(CPU_FTR_ARCH_207S))
353361 host_pcr_bit = PCR_ARCH_207;
....@@ -373,6 +381,9 @@
373381 case PVR_ARCH_300:
374382 guest_pcr_bit = PCR_ARCH_300;
375383 break;
384
+ case PVR_ARCH_31:
385
+ guest_pcr_bit = PCR_ARCH_31;
386
+ break;
376387 default:
377388 return -EINVAL;
378389 }
....@@ -384,8 +395,11 @@
384395
385396 spin_lock(&vc->lock);
386397 vc->arch_compat = arch_compat;
387
- /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
388
- vc->pcr = host_pcr_bit - guest_pcr_bit;
398
+ /*
399
+ * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
400
+ * Also set all reserved PCR bits
401
+ */
402
+ vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
389403 spin_unlock(&vc->lock);
390404
391405 return 0;
....@@ -725,8 +739,7 @@
725739 /*
726740 * Ensure that the read of vcore->dpdes comes after the read
727741 * of vcpu->doorbell_request. This barrier matches the
728
- * lwsync in book3s_hv_rmhandlers.S just before the
729
- * fast_guest_return label.
742
+ * smp_wmb() in kvmppc_guest_entry_inject().
730743 */
731744 smp_rmb();
732745 vc = vcpu->arch.vcore;
....@@ -761,7 +774,7 @@
761774 return H_P3;
762775 vcpu->arch.ciabr = value1;
763776 return H_SUCCESS;
764
- case H_SET_MODE_RESOURCE_SET_DAWR:
777
+ case H_SET_MODE_RESOURCE_SET_DAWR0:
765778 if (!kvmppc_power8_compatible(vcpu))
766779 return H_P2;
767780 if (!ppc_breakpoint_available())
....@@ -773,9 +786,88 @@
773786 vcpu->arch.dawr = value1;
774787 vcpu->arch.dawrx = value2;
775788 return H_SUCCESS;
789
+ case H_SET_MODE_RESOURCE_ADDR_TRANS_MODE:
790
+ /* KVM does not support mflags=2 (AIL=2) */
791
+ if (mflags != 0 && mflags != 3)
792
+ return H_UNSUPPORTED_FLAG_START;
793
+ return H_TOO_HARD;
776794 default:
777795 return H_TOO_HARD;
778796 }
797
+}
798
+
799
+/* Copy guest memory in place - must reside within a single memslot */
800
+static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
801
+ unsigned long len)
802
+{
803
+ struct kvm_memory_slot *to_memslot = NULL;
804
+ struct kvm_memory_slot *from_memslot = NULL;
805
+ unsigned long to_addr, from_addr;
806
+ int r;
807
+
808
+ /* Get HPA for from address */
809
+ from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
810
+ if (!from_memslot)
811
+ return -EFAULT;
812
+ if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
813
+ << PAGE_SHIFT))
814
+ return -EINVAL;
815
+ from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
816
+ if (kvm_is_error_hva(from_addr))
817
+ return -EFAULT;
818
+ from_addr |= (from & (PAGE_SIZE - 1));
819
+
820
+ /* Get HPA for to address */
821
+ to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
822
+ if (!to_memslot)
823
+ return -EFAULT;
824
+ if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
825
+ << PAGE_SHIFT))
826
+ return -EINVAL;
827
+ to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
828
+ if (kvm_is_error_hva(to_addr))
829
+ return -EFAULT;
830
+ to_addr |= (to & (PAGE_SIZE - 1));
831
+
832
+ /* Perform copy */
833
+ r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
834
+ len);
835
+ if (r)
836
+ return -EFAULT;
837
+ mark_page_dirty(kvm, to >> PAGE_SHIFT);
838
+ return 0;
839
+}
840
+
841
+static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
842
+ unsigned long dest, unsigned long src)
843
+{
844
+ u64 pg_sz = SZ_4K; /* 4K page size */
845
+ u64 pg_mask = SZ_4K - 1;
846
+ int ret;
847
+
848
+ /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
849
+ if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
850
+ H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
851
+ return H_PARAMETER;
852
+
853
+ /* dest (and src if copy_page flag set) must be page aligned */
854
+ if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
855
+ return H_PARAMETER;
856
+
857
+ /* zero and/or copy the page as determined by the flags */
858
+ if (flags & H_COPY_PAGE) {
859
+ ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
860
+ if (ret < 0)
861
+ return H_PARAMETER;
862
+ } else if (flags & H_ZERO_PAGE) {
863
+ ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
864
+ if (ret < 0)
865
+ return H_PARAMETER;
866
+ }
867
+
868
+ /* We can ignore the remaining flags */
869
+
870
+ return H_SUCCESS;
779871 }
780872
781873 static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
....@@ -899,7 +991,7 @@
899991 case H_IPOLL:
900992 case H_XIRR_X:
901993 if (kvmppc_xics_enabled(vcpu)) {
902
- if (xive_enabled()) {
994
+ if (xics_on_xive()) {
903995 ret = H_NOT_AVAILABLE;
904996 return RESUME_GUEST;
905997 }
....@@ -907,6 +999,20 @@
907999 break;
9081000 }
9091001 return RESUME_HOST;
1002
+ case H_SET_DABR:
1003
+ ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
1004
+ break;
1005
+ case H_SET_XDABR:
1006
+ ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
1007
+ kvmppc_get_gpr(vcpu, 5));
1008
+ break;
1009
+#ifdef CONFIG_SPAPR_TCE_IOMMU
1010
+ case H_GET_TCE:
1011
+ ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
1012
+ kvmppc_get_gpr(vcpu, 5));
1013
+ if (ret == H_TOO_HARD)
1014
+ return RESUME_HOST;
1015
+ break;
9101016 case H_PUT_TCE:
9111017 ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
9121018 kvmppc_get_gpr(vcpu, 5),
....@@ -930,12 +1036,108 @@
9301036 if (ret == H_TOO_HARD)
9311037 return RESUME_HOST;
9321038 break;
1039
+#endif
1040
+ case H_RANDOM:
1041
+ if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
1042
+ ret = H_HARDWARE;
1043
+ break;
1044
+
1045
+ case H_SET_PARTITION_TABLE:
1046
+ ret = H_FUNCTION;
1047
+ if (nesting_enabled(vcpu->kvm))
1048
+ ret = kvmhv_set_partition_table(vcpu);
1049
+ break;
1050
+ case H_ENTER_NESTED:
1051
+ ret = H_FUNCTION;
1052
+ if (!nesting_enabled(vcpu->kvm))
1053
+ break;
1054
+ ret = kvmhv_enter_nested_guest(vcpu);
1055
+ if (ret == H_INTERRUPT) {
1056
+ kvmppc_set_gpr(vcpu, 3, 0);
1057
+ vcpu->arch.hcall_needed = 0;
1058
+ return -EINTR;
1059
+ } else if (ret == H_TOO_HARD) {
1060
+ kvmppc_set_gpr(vcpu, 3, 0);
1061
+ vcpu->arch.hcall_needed = 0;
1062
+ return RESUME_HOST;
1063
+ }
1064
+ break;
1065
+ case H_TLB_INVALIDATE:
1066
+ ret = H_FUNCTION;
1067
+ if (nesting_enabled(vcpu->kvm))
1068
+ ret = kvmhv_do_nested_tlbie(vcpu);
1069
+ break;
1070
+ case H_COPY_TOFROM_GUEST:
1071
+ ret = H_FUNCTION;
1072
+ if (nesting_enabled(vcpu->kvm))
1073
+ ret = kvmhv_copy_tofrom_guest_nested(vcpu);
1074
+ break;
1075
+ case H_PAGE_INIT:
1076
+ ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
1077
+ kvmppc_get_gpr(vcpu, 5),
1078
+ kvmppc_get_gpr(vcpu, 6));
1079
+ break;
1080
+ case H_SVM_PAGE_IN:
1081
+ ret = H_UNSUPPORTED;
1082
+ if (kvmppc_get_srr1(vcpu) & MSR_S)
1083
+ ret = kvmppc_h_svm_page_in(vcpu->kvm,
1084
+ kvmppc_get_gpr(vcpu, 4),
1085
+ kvmppc_get_gpr(vcpu, 5),
1086
+ kvmppc_get_gpr(vcpu, 6));
1087
+ break;
1088
+ case H_SVM_PAGE_OUT:
1089
+ ret = H_UNSUPPORTED;
1090
+ if (kvmppc_get_srr1(vcpu) & MSR_S)
1091
+ ret = kvmppc_h_svm_page_out(vcpu->kvm,
1092
+ kvmppc_get_gpr(vcpu, 4),
1093
+ kvmppc_get_gpr(vcpu, 5),
1094
+ kvmppc_get_gpr(vcpu, 6));
1095
+ break;
1096
+ case H_SVM_INIT_START:
1097
+ ret = H_UNSUPPORTED;
1098
+ if (kvmppc_get_srr1(vcpu) & MSR_S)
1099
+ ret = kvmppc_h_svm_init_start(vcpu->kvm);
1100
+ break;
1101
+ case H_SVM_INIT_DONE:
1102
+ ret = H_UNSUPPORTED;
1103
+ if (kvmppc_get_srr1(vcpu) & MSR_S)
1104
+ ret = kvmppc_h_svm_init_done(vcpu->kvm);
1105
+ break;
1106
+ case H_SVM_INIT_ABORT:
1107
+ /*
1108
+ * Even if that call is made by the Ultravisor, the SSR1 value
1109
+ * is the guest context one, with the secure bit clear as it has
1110
+ * not yet been secured. So we can't check it here.
1111
+ * Instead the kvm->arch.secure_guest flag is checked inside
1112
+ * kvmppc_h_svm_init_abort().
1113
+ */
1114
+ ret = kvmppc_h_svm_init_abort(vcpu->kvm);
1115
+ break;
1116
+
9331117 default:
9341118 return RESUME_HOST;
9351119 }
9361120 kvmppc_set_gpr(vcpu, 3, ret);
9371121 vcpu->arch.hcall_needed = 0;
9381122 return RESUME_GUEST;
1123
+}
1124
+
1125
+/*
1126
+ * Handle H_CEDE in the nested virtualization case where we haven't
1127
+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
1128
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
1129
+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
1130
+ */
1131
+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
1132
+{
1133
+ vcpu->arch.shregs.msr |= MSR_EE;
1134
+ vcpu->arch.ceded = 1;
1135
+ smp_mb();
1136
+ if (vcpu->arch.prodded) {
1137
+ vcpu->arch.prodded = 0;
1138
+ smp_mb();
1139
+ vcpu->arch.ceded = 0;
1140
+ }
9391141 }
9401142
9411143 static int kvmppc_hcall_impl_hv(unsigned long cmd)
....@@ -956,6 +1158,7 @@
9561158 case H_IPOLL:
9571159 case H_XIRR_X:
9581160 #endif
1161
+ case H_PAGE_INIT:
9591162 return 1;
9601163 }
9611164
....@@ -963,8 +1166,7 @@
9631166 return kvmppc_hcall_impl_hv_realmode(cmd);
9641167 }
9651168
966
-static int kvmppc_emulate_debug_inst(struct kvm_run *run,
967
- struct kvm_vcpu *vcpu)
1169
+static int kvmppc_emulate_debug_inst(struct kvm_vcpu *vcpu)
9681170 {
9691171 u32 last_inst;
9701172
....@@ -978,8 +1180,8 @@
9781180 }
9791181
9801182 if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
981
- run->exit_reason = KVM_EXIT_DEBUG;
982
- run->debug.arch.address = kvmppc_get_pc(vcpu);
1183
+ vcpu->run->exit_reason = KVM_EXIT_DEBUG;
1184
+ vcpu->run->debug.arch.address = kvmppc_get_pc(vcpu);
9831185 return RESUME_HOST;
9841186 } else {
9851187 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
....@@ -1080,10 +1282,10 @@
10801282 return RESUME_GUEST;
10811283 }
10821284
1083
-/* Called with vcpu->arch.vcore->lock held */
1084
-static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
1285
+static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
10851286 struct task_struct *tsk)
10861287 {
1288
+ struct kvm_run *run = vcpu->run;
10871289 int r = RESUME_HOST;
10881290
10891291 vcpu->stat.sum_exits++;
....@@ -1127,6 +1329,22 @@
11271329 r = RESUME_GUEST;
11281330 break;
11291331 case BOOK3S_INTERRUPT_MACHINE_CHECK:
1332
+ /* Print the MCE event to host console. */
1333
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1334
+
1335
+ /*
1336
+ * If the guest can do FWNMI, exit to userspace so it can
1337
+ * deliver a FWNMI to the guest.
1338
+ * Otherwise we synthesize a machine check for the guest
1339
+ * so that it knows that the machine check occurred.
1340
+ */
1341
+ if (!vcpu->kvm->arch.fwnmi_enabled) {
1342
+ ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
1343
+ kvmppc_core_queue_machine_check(vcpu, flags);
1344
+ r = RESUME_GUEST;
1345
+ break;
1346
+ }
1347
+
11301348 /* Exit to guest with KVM_EXIT_NMI as exit reason */
11311349 run->exit_reason = KVM_EXIT_NMI;
11321350 run->hw.hardware_exit_reason = vcpu->arch.trap;
....@@ -1139,8 +1357,6 @@
11391357 run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
11401358
11411359 r = RESUME_HOST;
1142
- /* Print the MCE event to host console. */
1143
- machine_check_print_event_info(&vcpu->arch.mce_evt, false);
11441360 break;
11451361 case BOOK3S_INTERRUPT_PROGRAM:
11461362 {
....@@ -1185,7 +1401,10 @@
11851401 break;
11861402 case BOOK3S_INTERRUPT_H_INST_STORAGE:
11871403 vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1188
- vcpu->arch.fault_dsisr = 0;
1404
+ vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
1405
+ DSISR_SRR1_MATCH_64S;
1406
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1407
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
11891408 r = RESUME_PAGE_FAULT;
11901409 break;
11911410 /*
....@@ -1201,10 +1420,7 @@
12011420 swab32(vcpu->arch.emul_inst) :
12021421 vcpu->arch.emul_inst;
12031422 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
1204
- /* Need vcore unlocked to call kvmppc_get_last_inst */
1205
- spin_unlock(&vcpu->arch.vcore->lock);
1206
- r = kvmppc_emulate_debug_inst(run, vcpu);
1207
- spin_lock(&vcpu->arch.vcore->lock);
1423
+ r = kvmppc_emulate_debug_inst(vcpu);
12081424 } else {
12091425 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
12101426 r = RESUME_GUEST;
....@@ -1220,12 +1436,8 @@
12201436 case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
12211437 r = EMULATE_FAIL;
12221438 if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
1223
- cpu_has_feature(CPU_FTR_ARCH_300)) {
1224
- /* Need vcore unlocked to call kvmppc_get_last_inst */
1225
- spin_unlock(&vcpu->arch.vcore->lock);
1439
+ cpu_has_feature(CPU_FTR_ARCH_300))
12261440 r = kvmppc_emulate_doorbell_instr(vcpu);
1227
- spin_lock(&vcpu->arch.vcore->lock);
1228
- }
12291441 if (r == EMULATE_FAIL) {
12301442 kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
12311443 r = RESUME_GUEST;
....@@ -1253,6 +1465,104 @@
12531465 vcpu->arch.trap, kvmppc_get_pc(vcpu),
12541466 vcpu->arch.shregs.msr);
12551467 run->hw.hardware_exit_reason = vcpu->arch.trap;
1468
+ r = RESUME_HOST;
1469
+ break;
1470
+ }
1471
+
1472
+ return r;
1473
+}
1474
+
1475
+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
1476
+{
1477
+ int r;
1478
+ int srcu_idx;
1479
+
1480
+ vcpu->stat.sum_exits++;
1481
+
1482
+ /*
1483
+ * This can happen if an interrupt occurs in the last stages
1484
+ * of guest entry or the first stages of guest exit (i.e. after
1485
+ * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
1486
+ * and before setting it to KVM_GUEST_MODE_HOST_HV).
1487
+ * That can happen due to a bug, or due to a machine check
1488
+ * occurring at just the wrong time.
1489
+ */
1490
+ if (vcpu->arch.shregs.msr & MSR_HV) {
1491
+ pr_emerg("KVM trap in HV mode while nested!\n");
1492
+ pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
1493
+ vcpu->arch.trap, kvmppc_get_pc(vcpu),
1494
+ vcpu->arch.shregs.msr);
1495
+ kvmppc_dump_regs(vcpu);
1496
+ return RESUME_HOST;
1497
+ }
1498
+ switch (vcpu->arch.trap) {
1499
+ /* We're good on these - the host merely wanted to get our attention */
1500
+ case BOOK3S_INTERRUPT_HV_DECREMENTER:
1501
+ vcpu->stat.dec_exits++;
1502
+ r = RESUME_GUEST;
1503
+ break;
1504
+ case BOOK3S_INTERRUPT_EXTERNAL:
1505
+ vcpu->stat.ext_intr_exits++;
1506
+ r = RESUME_HOST;
1507
+ break;
1508
+ case BOOK3S_INTERRUPT_H_DOORBELL:
1509
+ case BOOK3S_INTERRUPT_H_VIRT:
1510
+ vcpu->stat.ext_intr_exits++;
1511
+ r = RESUME_GUEST;
1512
+ break;
1513
+ /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
1514
+ case BOOK3S_INTERRUPT_HMI:
1515
+ case BOOK3S_INTERRUPT_PERFMON:
1516
+ case BOOK3S_INTERRUPT_SYSTEM_RESET:
1517
+ r = RESUME_GUEST;
1518
+ break;
1519
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
1520
+ /* Pass the machine check to the L1 guest */
1521
+ r = RESUME_HOST;
1522
+ /* Print the MCE event to host console. */
1523
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
1524
+ break;
1525
+ /*
1526
+ * We get these next two if the guest accesses a page which it thinks
1527
+ * it has mapped but which is not actually present, either because
1528
+ * it is for an emulated I/O device or because the corresonding
1529
+ * host page has been paged out.
1530
+ */
1531
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
1532
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1533
+ r = kvmhv_nested_page_fault(vcpu);
1534
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1535
+ break;
1536
+ case BOOK3S_INTERRUPT_H_INST_STORAGE:
1537
+ vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
1538
+ vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
1539
+ DSISR_SRR1_MATCH_64S;
1540
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
1541
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
1542
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1543
+ r = kvmhv_nested_page_fault(vcpu);
1544
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
1545
+ break;
1546
+
1547
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1548
+ case BOOK3S_INTERRUPT_HV_SOFTPATCH:
1549
+ /*
1550
+ * This occurs for various TM-related instructions that
1551
+ * we need to emulate on POWER9 DD2.2. We have already
1552
+ * handled the cases where the guest was in real-suspend
1553
+ * mode and was transitioning to transactional state.
1554
+ */
1555
+ r = kvmhv_p9_tm_emulation(vcpu);
1556
+ break;
1557
+#endif
1558
+
1559
+ case BOOK3S_INTERRUPT_HV_RM_HARD:
1560
+ vcpu->arch.trap = 0;
1561
+ r = RESUME_GUEST;
1562
+ if (!xics_on_xive())
1563
+ kvmppc_xics_rm_complete(vcpu, 0);
1564
+ break;
1565
+ default:
12561566 r = RESUME_HOST;
12571567 break;
12581568 }
....@@ -1379,9 +1689,21 @@
13791689 case KVM_REG_PPC_UAMOR:
13801690 *val = get_reg_val(id, vcpu->arch.uamor);
13811691 break;
1382
- case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1692
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
13831693 i = id - KVM_REG_PPC_MMCR0;
13841694 *val = get_reg_val(id, vcpu->arch.mmcr[i]);
1695
+ break;
1696
+ case KVM_REG_PPC_MMCR2:
1697
+ *val = get_reg_val(id, vcpu->arch.mmcr[2]);
1698
+ break;
1699
+ case KVM_REG_PPC_MMCRA:
1700
+ *val = get_reg_val(id, vcpu->arch.mmcra);
1701
+ break;
1702
+ case KVM_REG_PPC_MMCRS:
1703
+ *val = get_reg_val(id, vcpu->arch.mmcrs);
1704
+ break;
1705
+ case KVM_REG_PPC_MMCR3:
1706
+ *val = get_reg_val(id, vcpu->arch.mmcr[3]);
13851707 break;
13861708 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
13871709 i = id - KVM_REG_PPC_PMC1;
....@@ -1398,7 +1720,13 @@
13981720 *val = get_reg_val(id, vcpu->arch.sdar);
13991721 break;
14001722 case KVM_REG_PPC_SIER:
1401
- *val = get_reg_val(id, vcpu->arch.sier);
1723
+ *val = get_reg_val(id, vcpu->arch.sier[0]);
1724
+ break;
1725
+ case KVM_REG_PPC_SIER2:
1726
+ *val = get_reg_val(id, vcpu->arch.sier[1]);
1727
+ break;
1728
+ case KVM_REG_PPC_SIER3:
1729
+ *val = get_reg_val(id, vcpu->arch.sier[2]);
14021730 break;
14031731 case KVM_REG_PPC_IAMR:
14041732 *val = get_reg_val(id, vcpu->arch.iamr);
....@@ -1555,6 +1883,9 @@
15551883 case KVM_REG_PPC_ONLINE:
15561884 *val = get_reg_val(id, vcpu->arch.online);
15571885 break;
1886
+ case KVM_REG_PPC_PTCR:
1887
+ *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
1888
+ break;
15581889 default:
15591890 r = -EINVAL;
15601891 break;
....@@ -1597,9 +1928,21 @@
15971928 case KVM_REG_PPC_UAMOR:
15981929 vcpu->arch.uamor = set_reg_val(id, *val);
15991930 break;
1600
- case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1931
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
16011932 i = id - KVM_REG_PPC_MMCR0;
16021933 vcpu->arch.mmcr[i] = set_reg_val(id, *val);
1934
+ break;
1935
+ case KVM_REG_PPC_MMCR2:
1936
+ vcpu->arch.mmcr[2] = set_reg_val(id, *val);
1937
+ break;
1938
+ case KVM_REG_PPC_MMCRA:
1939
+ vcpu->arch.mmcra = set_reg_val(id, *val);
1940
+ break;
1941
+ case KVM_REG_PPC_MMCRS:
1942
+ vcpu->arch.mmcrs = set_reg_val(id, *val);
1943
+ break;
1944
+ case KVM_REG_PPC_MMCR3:
1945
+ vcpu->arch.mmcr[3] = set_reg_val(id, *val);
16031946 break;
16041947 case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
16051948 i = id - KVM_REG_PPC_PMC1;
....@@ -1616,7 +1959,13 @@
16161959 vcpu->arch.sdar = set_reg_val(id, *val);
16171960 break;
16181961 case KVM_REG_PPC_SIER:
1619
- vcpu->arch.sier = set_reg_val(id, *val);
1962
+ vcpu->arch.sier[0] = set_reg_val(id, *val);
1963
+ break;
1964
+ case KVM_REG_PPC_SIER2:
1965
+ vcpu->arch.sier[1] = set_reg_val(id, *val);
1966
+ break;
1967
+ case KVM_REG_PPC_SIER3:
1968
+ vcpu->arch.sier[2] = set_reg_val(id, *val);
16201969 break;
16211970 case KVM_REG_PPC_IAMR:
16221971 vcpu->arch.iamr = set_reg_val(id, *val);
....@@ -1786,6 +2135,9 @@
17862135 atomic_dec(&vcpu->arch.vcore->online_count);
17872136 vcpu->arch.online = i;
17882137 break;
2138
+ case KVM_REG_PPC_PTCR:
2139
+ vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
2140
+ break;
17892141 default:
17902142 r = -EINVAL;
17912143 break;
....@@ -1819,7 +2171,7 @@
18192171
18202172 spin_lock_init(&vcore->lock);
18212173 spin_lock_init(&vcore->stoltb_lock);
1822
- init_swait_queue_head(&vcore->wq);
2174
+ rcuwait_init(&vcore->wait);
18232175 vcore->preempt_tb = TB_NIL;
18242176 vcore->lpcr = kvm->arch.lpcr;
18252177 vcore->first_vcpuid = id;
....@@ -1961,14 +2313,9 @@
19612313 struct kvm *kvm = vcpu->kvm;
19622314
19632315 snprintf(buf, sizeof(buf), "vcpu%u", id);
1964
- if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
1965
- return;
19662316 vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
1967
- if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
1968
- return;
1969
- vcpu->arch.debugfs_timings =
1970
- debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
1971
- vcpu, &debugfs_timings_ops);
2317
+ debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, vcpu,
2318
+ &debugfs_timings_ops);
19722319 }
19732320
19742321 #else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
....@@ -1977,22 +2324,16 @@
19772324 }
19782325 #endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
19792326
1980
-static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1981
- unsigned int id)
2327
+static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
19822328 {
1983
- struct kvm_vcpu *vcpu;
19842329 int err;
19852330 int core;
19862331 struct kvmppc_vcore *vcore;
2332
+ struct kvm *kvm;
2333
+ unsigned int id;
19872334
1988
- err = -ENOMEM;
1989
- vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1990
- if (!vcpu)
1991
- goto out;
1992
-
1993
- err = kvm_vcpu_init(vcpu, kvm, id);
1994
- if (err)
1995
- goto free_vcpu;
2335
+ kvm = vcpu->kvm;
2336
+ id = vcpu->vcpu_id;
19962337
19972338 vcpu->arch.shared = &vcpu->arch.shregs;
19982339 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
....@@ -2019,15 +2360,20 @@
20192360 * Set the default HFSCR for the guest from the host value.
20202361 * This value is only used on POWER9.
20212362 * On POWER9, we want to virtualize the doorbell facility, so we
2022
- * turn off the HFSCR bit, which causes those instructions to trap.
2363
+ * don't set the HFSCR_MSGP bit, and that causes those instructions
2364
+ * to trap and then we emulate them.
20232365 */
2024
- vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
2025
- if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
2366
+ vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
2367
+ HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP | HFSCR_PREFIX;
2368
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
2369
+ vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
2370
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
2371
+ if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
2372
+ vcpu->arch.hfscr |= HFSCR_TM;
2373
+#endif
2374
+ }
2375
+ if (cpu_has_feature(CPU_FTR_TM_COMP))
20262376 vcpu->arch.hfscr |= HFSCR_TM;
2027
- else if (!cpu_has_feature(CPU_FTR_TM_COMP))
2028
- vcpu->arch.hfscr &= ~HFSCR_TM;
2029
- if (cpu_has_feature(CPU_FTR_ARCH_300))
2030
- vcpu->arch.hfscr &= ~HFSCR_MSGP;
20312377
20322378 kvmppc_mmu_book3s_hv_init(vcpu);
20332379
....@@ -2055,17 +2401,23 @@
20552401 pr_devel("KVM: collision on id %u", id);
20562402 vcore = NULL;
20572403 } else if (!vcore) {
2404
+ /*
2405
+ * Take mmu_setup_lock for mutual exclusion
2406
+ * with kvmppc_update_lpcr().
2407
+ */
20582408 err = -ENOMEM;
20592409 vcore = kvmppc_vcore_create(kvm,
20602410 id & ~(kvm->arch.smt_mode - 1));
2411
+ mutex_lock(&kvm->arch.mmu_setup_lock);
20612412 kvm->arch.vcores[core] = vcore;
20622413 kvm->arch.online_vcores++;
2414
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
20632415 }
20642416 }
20652417 mutex_unlock(&kvm->lock);
20662418
20672419 if (!vcore)
2068
- goto uninit_vcpu;
2420
+ return err;
20692421
20702422 spin_lock(&vcore->lock);
20712423 ++vcore->num_threads;
....@@ -2080,14 +2432,7 @@
20802432
20812433 debugfs_vcpu_init(vcpu, id);
20822434
2083
- return vcpu;
2084
-
2085
-uninit_vcpu:
2086
- kvm_vcpu_uninit(vcpu);
2087
-free_vcpu:
2088
- kmem_cache_free(kvm_vcpu_cache, vcpu);
2089
-out:
2090
- return ERR_PTR(err);
2435
+ return 0;
20912436 }
20922437
20932438 static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
....@@ -2141,8 +2486,6 @@
21412486 unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
21422487 unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
21432488 spin_unlock(&vcpu->arch.vpa_update_lock);
2144
- kvm_vcpu_uninit(vcpu);
2145
- kmem_cache_free(kvm_vcpu_cache, vcpu);
21462489 }
21472490
21482491 static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
....@@ -2162,19 +2505,9 @@
21622505 kvmppc_core_prepare_to_enter(vcpu);
21632506 return;
21642507 }
2165
- dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
2166
- / tb_ticks_per_sec;
2508
+ dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
21672509 hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
21682510 vcpu->arch.timer_running = 1;
2169
-}
2170
-
2171
-static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
2172
-{
2173
- vcpu->arch.ceded = 0;
2174
- if (vcpu->arch.timer_running) {
2175
- hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
2176
- vcpu->arch.timer_running = 0;
2177
- }
21782511 }
21792512
21802513 extern int __kvmppc_vcore_entry(void);
....@@ -2244,24 +2577,43 @@
22442577
22452578 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
22462579 {
2580
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
2581
+ cpumask_t *cpu_in_guest;
22472582 int i;
22482583
2249
- cpu = cpu_first_thread_sibling(cpu);
2250
- cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
2584
+ cpu = cpu_first_tlb_thread_sibling(cpu);
2585
+ if (nested) {
2586
+ cpumask_set_cpu(cpu, &nested->need_tlb_flush);
2587
+ cpu_in_guest = &nested->cpu_in_guest;
2588
+ } else {
2589
+ cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
2590
+ cpu_in_guest = &kvm->arch.cpu_in_guest;
2591
+ }
22512592 /*
22522593 * Make sure setting of bit in need_tlb_flush precedes
22532594 * testing of cpu_in_guest bits. The matching barrier on
22542595 * the other side is the first smp_mb() in kvmppc_run_core().
22552596 */
22562597 smp_mb();
2257
- for (i = 0; i < threads_per_core; ++i)
2258
- if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
2259
- smp_call_function_single(cpu + i, do_nothing, NULL, 1);
2598
+ for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
2599
+ i += cpu_tlb_thread_sibling_step())
2600
+ if (cpumask_test_cpu(i, cpu_in_guest))
2601
+ smp_call_function_single(i, do_nothing, NULL, 1);
22602602 }
22612603
22622604 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
22632605 {
2606
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
22642607 struct kvm *kvm = vcpu->kvm;
2608
+ int prev_cpu;
2609
+
2610
+ if (!cpu_has_feature(CPU_FTR_HVMODE))
2611
+ return;
2612
+
2613
+ if (nested)
2614
+ prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
2615
+ else
2616
+ prev_cpu = vcpu->arch.prev_cpu;
22652617
22662618 /*
22672619 * With radix, the guest can do TLB invalidations itself,
....@@ -2275,12 +2627,15 @@
22752627 * ran to flush the TLB. The TLB is shared between threads,
22762628 * so we use a single bit in .need_tlb_flush for all 4 threads.
22772629 */
2278
- if (vcpu->arch.prev_cpu != pcpu) {
2279
- if (vcpu->arch.prev_cpu >= 0 &&
2280
- cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
2281
- cpu_first_thread_sibling(pcpu))
2282
- radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
2283
- vcpu->arch.prev_cpu = pcpu;
2630
+ if (prev_cpu != pcpu) {
2631
+ if (prev_cpu >= 0 &&
2632
+ cpu_first_tlb_thread_sibling(prev_cpu) !=
2633
+ cpu_first_tlb_thread_sibling(pcpu))
2634
+ radix_flush_cpu(kvm, prev_cpu, vcpu);
2635
+ if (nested)
2636
+ nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
2637
+ else
2638
+ vcpu->arch.prev_cpu = pcpu;
22842639 }
22852640 }
22862641
....@@ -2495,6 +2850,10 @@
24952850 if (!cpu_has_feature(CPU_FTR_ARCH_207S))
24962851 return false;
24972852
2853
+ /* In one_vm_per_core mode, require all vcores to be from the same vm */
2854
+ if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
2855
+ return false;
2856
+
24982857 /* Some POWER9 chips require all threads to be in the same MMU mode */
24992858 if (no_mixing_hpt_and_radix &&
25002859 kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
....@@ -2607,6 +2966,14 @@
26072966 spin_lock(&vc->lock);
26082967 now = get_tb();
26092968 for_each_runnable_thread(i, vcpu, vc) {
2969
+ /*
2970
+ * It's safe to unlock the vcore in the loop here, because
2971
+ * for_each_runnable_thread() is safe against removal of
2972
+ * the vcpu, and the vcore state is VCORE_EXITING here,
2973
+ * so any vcpus becoming runnable will have their arch.trap
2974
+ * set to zero and can't actually run in the guest.
2975
+ */
2976
+ spin_unlock(&vc->lock);
26102977 /* cancel pending dec exception if dec is positive */
26112978 if (now < vcpu->arch.dec_expires &&
26122979 kvmppc_core_pending_dec(vcpu))
....@@ -2616,12 +2983,13 @@
26162983
26172984 ret = RESUME_GUEST;
26182985 if (vcpu->arch.trap)
2619
- ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
2986
+ ret = kvmppc_handle_exit_hv(vcpu,
26202987 vcpu->arch.run_task);
26212988
26222989 vcpu->arch.ret = ret;
26232990 vcpu->arch.trap = 0;
26242991
2992
+ spin_lock(&vc->lock);
26252993 if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
26262994 if (vcpu->arch.pending_exceptions)
26272995 kvmppc_core_prepare_to_enter(vcpu);
....@@ -2969,32 +3337,6 @@
29693337 for (sub = 0; sub < core_info.n_subcores; ++sub)
29703338 spin_unlock(&core_info.vc[sub]->lock);
29713339
2972
- if (kvm_is_radix(vc->kvm)) {
2973
- int tmp = pcpu;
2974
-
2975
- /*
2976
- * Do we need to flush the process scoped TLB for the LPAR?
2977
- *
2978
- * On POWER9, individual threads can come in here, but the
2979
- * TLB is shared between the 4 threads in a core, hence
2980
- * invalidating on one thread invalidates for all.
2981
- * Thus we make all 4 threads use the same bit here.
2982
- *
2983
- * Hash must be flushed in realmode in order to use tlbiel.
2984
- */
2985
- mtspr(SPRN_LPID, vc->kvm->arch.lpid);
2986
- isync();
2987
-
2988
- if (cpu_has_feature(CPU_FTR_ARCH_300))
2989
- tmp &= ~0x3UL;
2990
-
2991
- if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
2992
- radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
2993
- /* Clear the bit after the TLB flush */
2994
- cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
2995
- }
2996
- }
2997
-
29983340 guest_enter_irqoff();
29993341
30003342 srcu_idx = srcu_read_lock(&vc->kvm->srcu);
....@@ -3057,8 +3399,22 @@
30573399
30583400 kvmppc_set_host_core(pcpu);
30593401
3402
+ context_tracking_guest_exit();
3403
+ if (!vtime_accounting_enabled_this_cpu()) {
3404
+ local_irq_enable();
3405
+ /*
3406
+ * Service IRQs here before vtime_account_guest_exit() so any
3407
+ * ticks that occurred while running the guest are accounted to
3408
+ * the guest. If vtime accounting is enabled, accounting uses
3409
+ * TB rather than ticks, so it can be done without enabling
3410
+ * interrupts here, which has the problem that it accounts
3411
+ * interrupt processing overhead to the host.
3412
+ */
3413
+ local_irq_disable();
3414
+ }
3415
+ vtime_account_guest_exit();
3416
+
30603417 local_irq_enable();
3061
- guest_exit();
30623418
30633419 /* Let secondaries go back to the offline loop */
30643420 for (i = 0; i < controlled_threads; ++i) {
....@@ -3088,6 +3444,367 @@
30883444 }
30893445
30903446 /*
3447
+ * Load up hypervisor-mode registers on P9.
3448
+ */
3449
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
3450
+ unsigned long lpcr)
3451
+{
3452
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
3453
+ s64 hdec;
3454
+ u64 tb, purr, spurr;
3455
+ int trap;
3456
+ unsigned long host_hfscr = mfspr(SPRN_HFSCR);
3457
+ unsigned long host_ciabr = mfspr(SPRN_CIABR);
3458
+ unsigned long host_dawr = mfspr(SPRN_DAWR0);
3459
+ unsigned long host_dawrx = mfspr(SPRN_DAWRX0);
3460
+ unsigned long host_psscr = mfspr(SPRN_PSSCR);
3461
+ unsigned long host_pidr = mfspr(SPRN_PID);
3462
+
3463
+ /*
3464
+ * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0,
3465
+ * so set HDICE before writing HDEC.
3466
+ */
3467
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE);
3468
+ isync();
3469
+
3470
+ hdec = time_limit - mftb();
3471
+ if (hdec < 0) {
3472
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
3473
+ isync();
3474
+ return BOOK3S_INTERRUPT_HV_DECREMENTER;
3475
+ }
3476
+ mtspr(SPRN_HDEC, hdec);
3477
+
3478
+ if (vc->tb_offset) {
3479
+ u64 new_tb = mftb() + vc->tb_offset;
3480
+ mtspr(SPRN_TBU40, new_tb);
3481
+ tb = mftb();
3482
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
3483
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
3484
+ vc->tb_offset_applied = vc->tb_offset;
3485
+ }
3486
+
3487
+ if (vc->pcr)
3488
+ mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
3489
+ mtspr(SPRN_DPDES, vc->dpdes);
3490
+ mtspr(SPRN_VTB, vc->vtb);
3491
+
3492
+ local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
3493
+ local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
3494
+ mtspr(SPRN_PURR, vcpu->arch.purr);
3495
+ mtspr(SPRN_SPURR, vcpu->arch.spurr);
3496
+
3497
+ if (dawr_enabled()) {
3498
+ mtspr(SPRN_DAWR0, vcpu->arch.dawr);
3499
+ mtspr(SPRN_DAWRX0, vcpu->arch.dawrx);
3500
+ }
3501
+ mtspr(SPRN_CIABR, vcpu->arch.ciabr);
3502
+ mtspr(SPRN_IC, vcpu->arch.ic);
3503
+ mtspr(SPRN_PID, vcpu->arch.pid);
3504
+
3505
+ mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
3506
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
3507
+
3508
+ mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
3509
+
3510
+ mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
3511
+ mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
3512
+ mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
3513
+ mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
3514
+
3515
+ mtspr(SPRN_AMOR, ~0UL);
3516
+
3517
+ mtspr(SPRN_LPCR, lpcr);
3518
+ isync();
3519
+
3520
+ kvmppc_xive_push_vcpu(vcpu);
3521
+
3522
+ mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
3523
+ mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
3524
+
3525
+ trap = __kvmhv_vcpu_entry_p9(vcpu);
3526
+
3527
+ /* Advance host PURR/SPURR by the amount used by guest */
3528
+ purr = mfspr(SPRN_PURR);
3529
+ spurr = mfspr(SPRN_SPURR);
3530
+ mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
3531
+ purr - vcpu->arch.purr);
3532
+ mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
3533
+ spurr - vcpu->arch.spurr);
3534
+ vcpu->arch.purr = purr;
3535
+ vcpu->arch.spurr = spurr;
3536
+
3537
+ vcpu->arch.ic = mfspr(SPRN_IC);
3538
+ vcpu->arch.pid = mfspr(SPRN_PID);
3539
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
3540
+
3541
+ vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
3542
+ vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
3543
+ vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
3544
+ vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
3545
+
3546
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
3547
+ mtspr(SPRN_PSSCR, host_psscr |
3548
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
3549
+ mtspr(SPRN_HFSCR, host_hfscr);
3550
+ mtspr(SPRN_CIABR, host_ciabr);
3551
+ mtspr(SPRN_DAWR0, host_dawr);
3552
+ mtspr(SPRN_DAWRX0, host_dawrx);
3553
+ mtspr(SPRN_PID, host_pidr);
3554
+
3555
+ /*
3556
+ * Since this is radix, do a eieio; tlbsync; ptesync sequence in
3557
+ * case we interrupted the guest between a tlbie and a ptesync.
3558
+ */
3559
+ asm volatile("eieio; tlbsync; ptesync");
3560
+
3561
+ /*
3562
+ * cp_abort is required if the processor supports local copy-paste
3563
+ * to clear the copy buffer that was under control of the guest.
3564
+ */
3565
+ if (cpu_has_feature(CPU_FTR_ARCH_31))
3566
+ asm volatile(PPC_CP_ABORT);
3567
+
3568
+ mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
3569
+ isync();
3570
+
3571
+ vc->dpdes = mfspr(SPRN_DPDES);
3572
+ vc->vtb = mfspr(SPRN_VTB);
3573
+ mtspr(SPRN_DPDES, 0);
3574
+ if (vc->pcr)
3575
+ mtspr(SPRN_PCR, PCR_MASK);
3576
+
3577
+ if (vc->tb_offset_applied) {
3578
+ u64 new_tb = mftb() - vc->tb_offset_applied;
3579
+ mtspr(SPRN_TBU40, new_tb);
3580
+ tb = mftb();
3581
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
3582
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
3583
+ vc->tb_offset_applied = 0;
3584
+ }
3585
+
3586
+ mtspr(SPRN_HDEC, 0x7fffffff);
3587
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
3588
+
3589
+ return trap;
3590
+}
3591
+
3592
+/*
3593
+ * Virtual-mode guest entry for POWER9 and later when the host and
3594
+ * guest are both using the radix MMU. The LPIDR has already been set.
3595
+ */
3596
+static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
3597
+ unsigned long lpcr)
3598
+{
3599
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
3600
+ unsigned long host_dscr = mfspr(SPRN_DSCR);
3601
+ unsigned long host_tidr = mfspr(SPRN_TIDR);
3602
+ unsigned long host_iamr = mfspr(SPRN_IAMR);
3603
+ unsigned long host_amr = mfspr(SPRN_AMR);
3604
+ unsigned long host_fscr = mfspr(SPRN_FSCR);
3605
+ s64 dec;
3606
+ u64 tb;
3607
+ int trap, save_pmu;
3608
+
3609
+ dec = mfspr(SPRN_DEC);
3610
+ tb = mftb();
3611
+ if (dec < 0)
3612
+ return BOOK3S_INTERRUPT_HV_DECREMENTER;
3613
+ local_paca->kvm_hstate.dec_expires = dec + tb;
3614
+ if (local_paca->kvm_hstate.dec_expires < time_limit)
3615
+ time_limit = local_paca->kvm_hstate.dec_expires;
3616
+
3617
+ vcpu->arch.ceded = 0;
3618
+
3619
+ kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
3620
+
3621
+ kvmppc_subcore_enter_guest();
3622
+
3623
+ vc->entry_exit_map = 1;
3624
+ vc->in_guest = 1;
3625
+
3626
+ if (vcpu->arch.vpa.pinned_addr) {
3627
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3628
+ u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3629
+ lp->yield_count = cpu_to_be32(yield_count);
3630
+ vcpu->arch.vpa.dirty = 1;
3631
+ }
3632
+
3633
+ if (cpu_has_feature(CPU_FTR_TM) ||
3634
+ cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3635
+ kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3636
+
3637
+#ifdef CONFIG_PPC_PSERIES
3638
+ if (kvmhv_on_pseries()) {
3639
+ barrier();
3640
+ if (vcpu->arch.vpa.pinned_addr) {
3641
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3642
+ get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
3643
+ } else {
3644
+ get_lppaca()->pmcregs_in_use = 1;
3645
+ }
3646
+ barrier();
3647
+ }
3648
+#endif
3649
+ kvmhv_load_guest_pmu(vcpu);
3650
+
3651
+ msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3652
+ load_fp_state(&vcpu->arch.fp);
3653
+#ifdef CONFIG_ALTIVEC
3654
+ load_vr_state(&vcpu->arch.vr);
3655
+#endif
3656
+ mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
3657
+
3658
+ mtspr(SPRN_DSCR, vcpu->arch.dscr);
3659
+ mtspr(SPRN_IAMR, vcpu->arch.iamr);
3660
+ mtspr(SPRN_PSPB, vcpu->arch.pspb);
3661
+ mtspr(SPRN_FSCR, vcpu->arch.fscr);
3662
+ mtspr(SPRN_TAR, vcpu->arch.tar);
3663
+ mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
3664
+ mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
3665
+ mtspr(SPRN_BESCR, vcpu->arch.bescr);
3666
+ mtspr(SPRN_WORT, vcpu->arch.wort);
3667
+ mtspr(SPRN_TIDR, vcpu->arch.tid);
3668
+ mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
3669
+ mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
3670
+ mtspr(SPRN_AMR, vcpu->arch.amr);
3671
+ mtspr(SPRN_UAMOR, vcpu->arch.uamor);
3672
+
3673
+ if (!(vcpu->arch.ctrl & 1))
3674
+ mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
3675
+
3676
+ mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
3677
+
3678
+ if (kvmhv_on_pseries()) {
3679
+ /*
3680
+ * We need to save and restore the guest visible part of the
3681
+ * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
3682
+ * doesn't do this for us. Note only required if pseries since
3683
+ * this is done in kvmhv_load_hv_regs_and_go() below otherwise.
3684
+ */
3685
+ unsigned long host_psscr;
3686
+ /* call our hypervisor to load up HV regs and go */
3687
+ struct hv_guest_state hvregs;
3688
+
3689
+ host_psscr = mfspr(SPRN_PSSCR_PR);
3690
+ mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
3691
+ kvmhv_save_hv_regs(vcpu, &hvregs);
3692
+ hvregs.lpcr = lpcr;
3693
+ vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
3694
+ hvregs.version = HV_GUEST_STATE_VERSION;
3695
+ if (vcpu->arch.nested) {
3696
+ hvregs.lpid = vcpu->arch.nested->shadow_lpid;
3697
+ hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
3698
+ } else {
3699
+ hvregs.lpid = vcpu->kvm->arch.lpid;
3700
+ hvregs.vcpu_token = vcpu->vcpu_id;
3701
+ }
3702
+ hvregs.hdec_expiry = time_limit;
3703
+ trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
3704
+ __pa(&vcpu->arch.regs));
3705
+ kvmhv_restore_hv_return_state(vcpu, &hvregs);
3706
+ vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
3707
+ vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
3708
+ vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
3709
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
3710
+ mtspr(SPRN_PSSCR_PR, host_psscr);
3711
+
3712
+ /* H_CEDE has to be handled now, not later */
3713
+ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
3714
+ kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
3715
+ kvmppc_nested_cede(vcpu);
3716
+ kvmppc_set_gpr(vcpu, 3, 0);
3717
+ trap = 0;
3718
+ }
3719
+ } else {
3720
+ trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
3721
+ }
3722
+
3723
+ vcpu->arch.slb_max = 0;
3724
+ dec = mfspr(SPRN_DEC);
3725
+ if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
3726
+ dec = (s32) dec;
3727
+ tb = mftb();
3728
+ vcpu->arch.dec_expires = dec + tb;
3729
+ vcpu->cpu = -1;
3730
+ vcpu->arch.thread_cpu = -1;
3731
+ /* Save guest CTRL register, set runlatch to 1 */
3732
+ vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
3733
+ if (!(vcpu->arch.ctrl & 1))
3734
+ mtspr(SPRN_CTRLT, vcpu->arch.ctrl | 1);
3735
+
3736
+ vcpu->arch.iamr = mfspr(SPRN_IAMR);
3737
+ vcpu->arch.pspb = mfspr(SPRN_PSPB);
3738
+ vcpu->arch.fscr = mfspr(SPRN_FSCR);
3739
+ vcpu->arch.tar = mfspr(SPRN_TAR);
3740
+ vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
3741
+ vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
3742
+ vcpu->arch.bescr = mfspr(SPRN_BESCR);
3743
+ vcpu->arch.wort = mfspr(SPRN_WORT);
3744
+ vcpu->arch.tid = mfspr(SPRN_TIDR);
3745
+ vcpu->arch.amr = mfspr(SPRN_AMR);
3746
+ vcpu->arch.uamor = mfspr(SPRN_UAMOR);
3747
+ vcpu->arch.dscr = mfspr(SPRN_DSCR);
3748
+
3749
+ mtspr(SPRN_PSPB, 0);
3750
+ mtspr(SPRN_WORT, 0);
3751
+ mtspr(SPRN_UAMOR, 0);
3752
+ mtspr(SPRN_DSCR, host_dscr);
3753
+ mtspr(SPRN_TIDR, host_tidr);
3754
+ mtspr(SPRN_IAMR, host_iamr);
3755
+ mtspr(SPRN_PSPB, 0);
3756
+
3757
+ if (host_amr != vcpu->arch.amr)
3758
+ mtspr(SPRN_AMR, host_amr);
3759
+
3760
+ if (host_fscr != vcpu->arch.fscr)
3761
+ mtspr(SPRN_FSCR, host_fscr);
3762
+
3763
+ msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
3764
+ store_fp_state(&vcpu->arch.fp);
3765
+#ifdef CONFIG_ALTIVEC
3766
+ store_vr_state(&vcpu->arch.vr);
3767
+#endif
3768
+ vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
3769
+
3770
+ if (cpu_has_feature(CPU_FTR_TM) ||
3771
+ cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
3772
+ kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
3773
+
3774
+ save_pmu = 1;
3775
+ if (vcpu->arch.vpa.pinned_addr) {
3776
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
3777
+ u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
3778
+ lp->yield_count = cpu_to_be32(yield_count);
3779
+ vcpu->arch.vpa.dirty = 1;
3780
+ save_pmu = lp->pmcregs_in_use;
3781
+ }
3782
+ /* Must save pmu if this guest is capable of running nested guests */
3783
+ save_pmu |= nesting_enabled(vcpu->kvm);
3784
+
3785
+ kvmhv_save_guest_pmu(vcpu, save_pmu);
3786
+#ifdef CONFIG_PPC_PSERIES
3787
+ if (kvmhv_on_pseries()) {
3788
+ barrier();
3789
+ get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
3790
+ barrier();
3791
+ }
3792
+#endif
3793
+
3794
+ vc->entry_exit_map = 0x101;
3795
+ vc->in_guest = 0;
3796
+
3797
+ mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
3798
+ mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
3799
+
3800
+ kvmhv_load_host_pmu();
3801
+
3802
+ kvmppc_subcore_exit_guest();
3803
+
3804
+ return trap;
3805
+}
3806
+
3807
+/*
30913808 * Wait for some other vcpu thread to execute us, and
30923809 * wake us up when we need to handle something in the host.
30933810 */
....@@ -3107,11 +3824,12 @@
31073824
31083825 static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
31093826 {
3110
- /* 10us base */
3111
- if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
3112
- vc->halt_poll_ns = 10000;
3113
- else
3114
- vc->halt_poll_ns *= halt_poll_ns_grow;
3827
+ if (!halt_poll_ns_grow)
3828
+ return;
3829
+
3830
+ vc->halt_poll_ns *= halt_poll_ns_grow;
3831
+ if (vc->halt_poll_ns < halt_poll_ns_grow_start)
3832
+ vc->halt_poll_ns = halt_poll_ns_grow_start;
31153833 }
31163834
31173835 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
....@@ -3125,7 +3843,7 @@
31253843 #ifdef CONFIG_KVM_XICS
31263844 static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
31273845 {
3128
- if (!xive_enabled())
3846
+ if (!xics_on_xive())
31293847 return false;
31303848 return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
31313849 vcpu->arch.xive_saved_state.cppr;
....@@ -3172,7 +3890,6 @@
31723890 ktime_t cur, start_poll, start_wait;
31733891 int do_sleep = 1;
31743892 u64 block_ns;
3175
- DECLARE_SWAITQUEUE(wait);
31763893
31773894 /* Poll for pending exceptions and ceded state */
31783895 cur = start_poll = ktime_get();
....@@ -3200,10 +3917,10 @@
32003917 }
32013918 }
32023919
3203
- prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
3204
-
3920
+ prepare_to_rcuwait(&vc->wait);
3921
+ set_current_state(TASK_INTERRUPTIBLE);
32053922 if (kvmppc_vcore_check_block(vc)) {
3206
- finish_swait(&vc->wq, &wait);
3923
+ finish_rcuwait(&vc->wait);
32073924 do_sleep = 0;
32083925 /* If we polled, count this as a successful poll */
32093926 if (vc->halt_poll_ns)
....@@ -3217,7 +3934,7 @@
32173934 trace_kvmppc_vcore_blocked(vc, 0);
32183935 spin_unlock(&vc->lock);
32193936 schedule();
3220
- finish_swait(&vc->wq, &wait);
3937
+ finish_rcuwait(&vc->wait);
32213938 spin_lock(&vc->lock);
32223939 vc->vcore_state = VCORE_INACTIVE;
32233940 trace_kvmppc_vcore_blocked(vc, 1);
....@@ -3264,12 +3981,17 @@
32643981 trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
32653982 }
32663983
3984
+/*
3985
+ * This never fails for a radix guest, as none of the operations it does
3986
+ * for a radix guest can fail or have a way to report failure.
3987
+ * kvmhv_run_single_vcpu() relies on this fact.
3988
+ */
32673989 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
32683990 {
32693991 int r = 0;
32703992 struct kvm *kvm = vcpu->kvm;
32713993
3272
- mutex_lock(&kvm->lock);
3994
+ mutex_lock(&kvm->arch.mmu_setup_lock);
32733995 if (!kvm->arch.mmu_ready) {
32743996 if (!kvm_is_radix(kvm))
32753997 r = kvmppc_hv_setup_htab_rma(vcpu);
....@@ -3279,19 +4001,20 @@
32794001 kvm->arch.mmu_ready = 1;
32804002 }
32814003 }
3282
- mutex_unlock(&kvm->lock);
4004
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
32834005 return r;
32844006 }
32854007
3286
-static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
4008
+static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
32874009 {
4010
+ struct kvm_run *run = vcpu->run;
32884011 int n_ceded, i, r;
32894012 struct kvmppc_vcore *vc;
32904013 struct kvm_vcpu *v;
32914014
32924015 trace_kvmppc_run_vcpu_enter(vcpu);
32934016
3294
- kvm_run->exit_reason = 0;
4017
+ run->exit_reason = 0;
32954018 vcpu->arch.ret = RESUME_GUEST;
32964019 vcpu->arch.trap = 0;
32974020 kvmppc_update_vpas(vcpu);
....@@ -3303,7 +4026,6 @@
33034026 spin_lock(&vc->lock);
33044027 vcpu->arch.ceded = 0;
33054028 vcpu->arch.run_task = current;
3306
- vcpu->arch.kvm_run = kvm_run;
33074029 vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
33084030 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
33094031 vcpu->arch.busy_preempt = TB_NIL;
....@@ -3323,7 +4045,7 @@
33234045 kvmppc_start_thread(vcpu, vc);
33244046 trace_kvm_guest_enter(vcpu);
33254047 } else if (vc->vcore_state == VCORE_SLEEPING) {
3326
- swake_up_one(&vc->wq);
4048
+ rcuwait_wake_up(&vc->wait);
33274049 }
33284050
33294051 }
....@@ -3336,8 +4058,8 @@
33364058 r = kvmhv_setup_mmu(vcpu);
33374059 spin_lock(&vc->lock);
33384060 if (r) {
3339
- kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3340
- kvm_run->fail_entry.
4061
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4062
+ run->fail_entry.
33414063 hardware_entry_failure_reason = 0;
33424064 vcpu->arch.ret = r;
33434065 break;
....@@ -3356,7 +4078,7 @@
33564078 if (signal_pending(v->arch.run_task)) {
33574079 kvmppc_remove_runnable(vc, v);
33584080 v->stat.signal_exits++;
3359
- v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
4081
+ v->run->exit_reason = KVM_EXIT_INTR;
33604082 v->arch.ret = -EINTR;
33614083 wake_up(&v->arch.cpu_run);
33624084 }
....@@ -3397,7 +4119,7 @@
33974119 if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
33984120 kvmppc_remove_runnable(vc, vcpu);
33994121 vcpu->stat.signal_exits++;
3400
- kvm_run->exit_reason = KVM_EXIT_INTR;
4122
+ run->exit_reason = KVM_EXIT_INTR;
34014123 vcpu->arch.ret = -EINTR;
34024124 }
34034125
....@@ -3408,13 +4130,206 @@
34084130 wake_up(&v->arch.cpu_run);
34094131 }
34104132
3411
- trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
4133
+ trace_kvmppc_run_vcpu_exit(vcpu);
34124134 spin_unlock(&vc->lock);
34134135 return vcpu->arch.ret;
34144136 }
34154137
3416
-static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
4138
+int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit,
4139
+ unsigned long lpcr)
34174140 {
4141
+ struct kvm_run *run = vcpu->run;
4142
+ int trap, r, pcpu;
4143
+ int srcu_idx, lpid;
4144
+ struct kvmppc_vcore *vc;
4145
+ struct kvm *kvm = vcpu->kvm;
4146
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
4147
+
4148
+ trace_kvmppc_run_vcpu_enter(vcpu);
4149
+
4150
+ run->exit_reason = 0;
4151
+ vcpu->arch.ret = RESUME_GUEST;
4152
+ vcpu->arch.trap = 0;
4153
+
4154
+ vc = vcpu->arch.vcore;
4155
+ vcpu->arch.ceded = 0;
4156
+ vcpu->arch.run_task = current;
4157
+ vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
4158
+ vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
4159
+ vcpu->arch.busy_preempt = TB_NIL;
4160
+ vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
4161
+ vc->runnable_threads[0] = vcpu;
4162
+ vc->n_runnable = 1;
4163
+ vc->runner = vcpu;
4164
+
4165
+ /* See if the MMU is ready to go */
4166
+ if (!kvm->arch.mmu_ready)
4167
+ kvmhv_setup_mmu(vcpu);
4168
+
4169
+ if (need_resched())
4170
+ cond_resched();
4171
+
4172
+ kvmppc_update_vpas(vcpu);
4173
+
4174
+ init_vcore_to_run(vc);
4175
+ vc->preempt_tb = TB_NIL;
4176
+
4177
+ preempt_disable();
4178
+ pcpu = smp_processor_id();
4179
+ vc->pcpu = pcpu;
4180
+ kvmppc_prepare_radix_vcpu(vcpu, pcpu);
4181
+
4182
+ local_irq_disable();
4183
+ hard_irq_disable();
4184
+ if (signal_pending(current))
4185
+ goto sigpend;
4186
+ if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
4187
+ goto out;
4188
+
4189
+ if (!nested) {
4190
+ kvmppc_core_prepare_to_enter(vcpu);
4191
+ if (vcpu->arch.doorbell_request) {
4192
+ vc->dpdes = 1;
4193
+ smp_wmb();
4194
+ vcpu->arch.doorbell_request = 0;
4195
+ }
4196
+ if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
4197
+ &vcpu->arch.pending_exceptions))
4198
+ lpcr |= LPCR_MER;
4199
+ } else if (vcpu->arch.pending_exceptions ||
4200
+ vcpu->arch.doorbell_request ||
4201
+ xive_interrupt_pending(vcpu)) {
4202
+ vcpu->arch.ret = RESUME_HOST;
4203
+ goto out;
4204
+ }
4205
+
4206
+ kvmppc_clear_host_core(pcpu);
4207
+
4208
+ local_paca->kvm_hstate.tid = 0;
4209
+ local_paca->kvm_hstate.napping = 0;
4210
+ local_paca->kvm_hstate.kvm_split_mode = NULL;
4211
+ kvmppc_start_thread(vcpu, vc);
4212
+ kvmppc_create_dtl_entry(vcpu, vc);
4213
+ trace_kvm_guest_enter(vcpu);
4214
+
4215
+ vc->vcore_state = VCORE_RUNNING;
4216
+ trace_kvmppc_run_core(vc, 0);
4217
+
4218
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
4219
+ lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
4220
+ mtspr(SPRN_LPID, lpid);
4221
+ isync();
4222
+ kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
4223
+ }
4224
+
4225
+ guest_enter_irqoff();
4226
+
4227
+ srcu_idx = srcu_read_lock(&kvm->srcu);
4228
+
4229
+ this_cpu_disable_ftrace();
4230
+
4231
+ /* Tell lockdep that we're about to enable interrupts */
4232
+ trace_hardirqs_on();
4233
+
4234
+ trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
4235
+ vcpu->arch.trap = trap;
4236
+
4237
+ trace_hardirqs_off();
4238
+
4239
+ this_cpu_enable_ftrace();
4240
+
4241
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
4242
+
4243
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
4244
+ mtspr(SPRN_LPID, kvm->arch.host_lpid);
4245
+ isync();
4246
+ }
4247
+
4248
+ set_irq_happened(trap);
4249
+
4250
+ kvmppc_set_host_core(pcpu);
4251
+
4252
+ context_tracking_guest_exit();
4253
+ if (!vtime_accounting_enabled_this_cpu()) {
4254
+ local_irq_enable();
4255
+ /*
4256
+ * Service IRQs here before vtime_account_guest_exit() so any
4257
+ * ticks that occurred while running the guest are accounted to
4258
+ * the guest. If vtime accounting is enabled, accounting uses
4259
+ * TB rather than ticks, so it can be done without enabling
4260
+ * interrupts here, which has the problem that it accounts
4261
+ * interrupt processing overhead to the host.
4262
+ */
4263
+ local_irq_disable();
4264
+ }
4265
+ vtime_account_guest_exit();
4266
+
4267
+ local_irq_enable();
4268
+
4269
+ cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
4270
+
4271
+ preempt_enable();
4272
+
4273
+ /*
4274
+ * cancel pending decrementer exception if DEC is now positive, or if
4275
+ * entering a nested guest in which case the decrementer is now owned
4276
+ * by L2 and the L1 decrementer is provided in hdec_expires
4277
+ */
4278
+ if (kvmppc_core_pending_dec(vcpu) &&
4279
+ ((get_tb() < vcpu->arch.dec_expires) ||
4280
+ (trap == BOOK3S_INTERRUPT_SYSCALL &&
4281
+ kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
4282
+ kvmppc_core_dequeue_dec(vcpu);
4283
+
4284
+ trace_kvm_guest_exit(vcpu);
4285
+ r = RESUME_GUEST;
4286
+ if (trap) {
4287
+ if (!nested)
4288
+ r = kvmppc_handle_exit_hv(vcpu, current);
4289
+ else
4290
+ r = kvmppc_handle_nested_exit(vcpu);
4291
+ }
4292
+ vcpu->arch.ret = r;
4293
+
4294
+ if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
4295
+ !kvmppc_vcpu_woken(vcpu)) {
4296
+ kvmppc_set_timer(vcpu);
4297
+ while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
4298
+ if (signal_pending(current)) {
4299
+ vcpu->stat.signal_exits++;
4300
+ run->exit_reason = KVM_EXIT_INTR;
4301
+ vcpu->arch.ret = -EINTR;
4302
+ break;
4303
+ }
4304
+ spin_lock(&vc->lock);
4305
+ kvmppc_vcore_blocked(vc);
4306
+ spin_unlock(&vc->lock);
4307
+ }
4308
+ }
4309
+ vcpu->arch.ceded = 0;
4310
+
4311
+ vc->vcore_state = VCORE_INACTIVE;
4312
+ trace_kvmppc_run_core(vc, 1);
4313
+
4314
+ done:
4315
+ kvmppc_remove_runnable(vc, vcpu);
4316
+ trace_kvmppc_run_vcpu_exit(vcpu);
4317
+
4318
+ return vcpu->arch.ret;
4319
+
4320
+ sigpend:
4321
+ vcpu->stat.signal_exits++;
4322
+ run->exit_reason = KVM_EXIT_INTR;
4323
+ vcpu->arch.ret = -EINTR;
4324
+ out:
4325
+ local_irq_enable();
4326
+ preempt_enable();
4327
+ goto done;
4328
+}
4329
+
4330
+static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
4331
+{
4332
+ struct kvm_run *run = vcpu->run;
34184333 int r;
34194334 int srcu_idx;
34204335 unsigned long ebb_regs[3] = {}; /* shut up GCC */
....@@ -3483,12 +4398,25 @@
34834398 }
34844399 user_vrsave = mfspr(SPRN_VRSAVE);
34854400
3486
- vcpu->arch.wqp = &vcpu->arch.vcore->wq;
3487
- vcpu->arch.pgdir = current->mm->pgd;
4401
+ vcpu->arch.waitp = &vcpu->arch.vcore->wait;
4402
+ vcpu->arch.pgdir = kvm->mm->pgd;
34884403 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
34894404
34904405 do {
3491
- r = kvmppc_run_vcpu(run, vcpu);
4406
+ /*
4407
+ * The early POWER9 chips that can't mix radix and HPT threads
4408
+ * on the same core also need the workaround for the problem
4409
+ * where the TLB would prefetch entries in the guest exit path
4410
+ * for radix guests using the guest PIDR value and LPID 0.
4411
+ * The workaround is in the old path (kvmppc_run_vcpu())
4412
+ * but not the new path (kvmhv_run_single_vcpu()).
4413
+ */
4414
+ if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
4415
+ !no_mixing_hpt_and_radix)
4416
+ r = kvmhv_run_single_vcpu(vcpu, ~(u64)0,
4417
+ vcpu->arch.vcore->lpcr);
4418
+ else
4419
+ r = kvmppc_run_vcpu(vcpu);
34924420
34934421 if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
34944422 !(vcpu->arch.shregs.msr & MSR_PR)) {
....@@ -3498,11 +4426,11 @@
34984426 kvmppc_core_prepare_to_enter(vcpu);
34994427 } else if (r == RESUME_PAGE_FAULT) {
35004428 srcu_idx = srcu_read_lock(&kvm->srcu);
3501
- r = kvmppc_book3s_hv_page_fault(run, vcpu,
4429
+ r = kvmppc_book3s_hv_page_fault(vcpu,
35024430 vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
35034431 srcu_read_unlock(&kvm->srcu, srcu_idx);
35044432 } else if (r == RESUME_PASSTHROUGH) {
3505
- if (WARN_ON(xive_enabled()))
4433
+ if (WARN_ON(xics_on_xive()))
35064434 r = H_SUCCESS;
35074435 else
35084436 r = kvmppc_xics_rm_complete(vcpu, 0);
....@@ -3567,6 +4495,10 @@
35674495 kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
35684496 kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
35694497
4498
+ /* If running as a nested hypervisor, we don't support HPT guests */
4499
+ if (kvmhv_on_pseries())
4500
+ info->flags |= KVM_PPC_NO_HASH;
4501
+
35704502 return 0;
35714503 }
35724504
....@@ -3592,7 +4524,7 @@
35924524 slots = kvm_memslots(kvm);
35934525 memslot = id_to_memslot(slots, log->slot);
35944526 r = -ENOENT;
3595
- if (!memslot->dirty_bitmap)
4527
+ if (!memslot || !memslot->dirty_bitmap)
35964528 goto out;
35974529
35984530 /*
....@@ -3639,36 +4571,38 @@
36394571 return r;
36404572 }
36414573
3642
-static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
3643
- struct kvm_memory_slot *dont)
4574
+static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot)
36444575 {
3645
- if (!dont || free->arch.rmap != dont->arch.rmap) {
3646
- vfree(free->arch.rmap);
3647
- free->arch.rmap = NULL;
3648
- }
3649
-}
3650
-
3651
-static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
3652
- unsigned long npages)
3653
-{
3654
- slot->arch.rmap = vzalloc(array_size(npages, sizeof(*slot->arch.rmap)));
3655
- if (!slot->arch.rmap)
3656
- return -ENOMEM;
3657
-
3658
- return 0;
4576
+ vfree(slot->arch.rmap);
4577
+ slot->arch.rmap = NULL;
36594578 }
36604579
36614580 static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
3662
- struct kvm_memory_slot *memslot,
3663
- const struct kvm_userspace_memory_region *mem)
4581
+ struct kvm_memory_slot *slot,
4582
+ const struct kvm_userspace_memory_region *mem,
4583
+ enum kvm_mr_change change)
36644584 {
4585
+ unsigned long npages = mem->memory_size >> PAGE_SHIFT;
4586
+
4587
+ if (change == KVM_MR_CREATE) {
4588
+ unsigned long size = array_size(npages, sizeof(*slot->arch.rmap));
4589
+
4590
+ if ((size >> PAGE_SHIFT) > totalram_pages())
4591
+ return -ENOMEM;
4592
+
4593
+ slot->arch.rmap = vzalloc(size);
4594
+ if (!slot->arch.rmap)
4595
+ return -ENOMEM;
4596
+ }
4597
+
36654598 return 0;
36664599 }
36674600
36684601 static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
36694602 const struct kvm_userspace_memory_region *mem,
36704603 const struct kvm_memory_slot *old,
3671
- const struct kvm_memory_slot *new)
4604
+ const struct kvm_memory_slot *new,
4605
+ enum kvm_mr_change change)
36724606 {
36734607 unsigned long npages = mem->memory_size >> PAGE_SHIFT;
36744608
....@@ -3680,11 +4614,50 @@
36804614 */
36814615 if (npages)
36824616 atomic64_inc(&kvm->arch.mmio_update);
4617
+
4618
+ /*
4619
+ * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
4620
+ * have already called kvm_arch_flush_shadow_memslot() to
4621
+ * flush shadow mappings. For KVM_MR_CREATE we have no
4622
+ * previous mappings. So the only case to handle is
4623
+ * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
4624
+ * has been changed.
4625
+ * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
4626
+ * to get rid of any THP PTEs in the partition-scoped page tables
4627
+ * so we can track dirtiness at the page level; we flush when
4628
+ * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
4629
+ * using THP PTEs.
4630
+ */
4631
+ if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
4632
+ ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
4633
+ kvmppc_radix_flush_memslot(kvm, old);
4634
+ /*
4635
+ * If UV hasn't yet called H_SVM_INIT_START, don't register memslots.
4636
+ */
4637
+ if (!kvm->arch.secure_guest)
4638
+ return;
4639
+
4640
+ switch (change) {
4641
+ case KVM_MR_CREATE:
4642
+ /*
4643
+ * @TODO kvmppc_uvmem_memslot_create() can fail and
4644
+ * return error. Fix this.
4645
+ */
4646
+ kvmppc_uvmem_memslot_create(kvm, new);
4647
+ break;
4648
+ case KVM_MR_DELETE:
4649
+ kvmppc_uvmem_memslot_delete(kvm, old);
4650
+ break;
4651
+ default:
4652
+ /* TODO: Handle KVM_MR_MOVE */
4653
+ break;
4654
+ }
36834655 }
36844656
36854657 /*
36864658 * Update LPCR values in kvm->arch and in vcores.
3687
- * Caller must hold kvm->lock.
4659
+ * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
4660
+ * of kvm->arch.lpcr update).
36884661 */
36894662 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
36904663 {
....@@ -3708,11 +4681,6 @@
37084681 }
37094682 }
37104683
3711
-static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
3712
-{
3713
- return;
3714
-}
3715
-
37164684 void kvmppc_setup_partition_table(struct kvm *kvm)
37174685 {
37184686 unsigned long dw0, dw1;
....@@ -3731,13 +4699,12 @@
37314699 __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
37324700 dw1 = PATB_GR | kvm->arch.process_table;
37334701 }
3734
-
3735
- mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
4702
+ kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
37364703 }
37374704
37384705 /*
37394706 * Set up HPT (hashed page table) and RMA (real-mode area).
3740
- * Must be called with kvm->lock held.
4707
+ * Must be called with kvm->arch.mmu_setup_lock held.
37414708 */
37424709 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
37434710 {
....@@ -3781,14 +4748,14 @@
37814748
37824749 /* Look up the VMA for the start of this memory slot */
37834750 hva = memslot->userspace_addr;
3784
- down_read(&current->mm->mmap_sem);
3785
- vma = find_vma(current->mm, hva);
4751
+ mmap_read_lock(kvm->mm);
4752
+ vma = find_vma(kvm->mm, hva);
37864753 if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
37874754 goto up_out;
37884755
37894756 psize = vma_kernel_pagesize(vma);
37904757
3791
- up_read(&current->mm->mmap_sem);
4758
+ mmap_read_unlock(kvm->mm);
37924759
37934760 /* We can handle 4k, 64k or 16M pages in the VRMA */
37944761 if (psize >= 0x1000000)
....@@ -3821,13 +4788,18 @@
38214788 return err;
38224789
38234790 up_out:
3824
- up_read(&current->mm->mmap_sem);
4791
+ mmap_read_unlock(kvm->mm);
38254792 goto out_srcu;
38264793 }
38274794
3828
-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
4795
+/*
4796
+ * Must be called with kvm->arch.mmu_setup_lock held and
4797
+ * mmu_ready = 0 and no vcpus running.
4798
+ */
38294799 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
38304800 {
4801
+ if (nesting_enabled(kvm))
4802
+ kvmhv_release_all_nested(kvm);
38314803 kvmppc_rmap_reset(kvm);
38324804 kvm->arch.process_table = 0;
38334805 /* Mutual exclusion with kvm_unmap_hva_range etc. */
....@@ -3840,7 +4812,10 @@
38404812 return 0;
38414813 }
38424814
3843
-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
4815
+/*
4816
+ * Must be called with kvm->arch.mmu_setup_lock held and
4817
+ * mmu_ready = 0 and no vcpus running.
4818
+ */
38444819 int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
38454820 {
38464821 int err;
....@@ -3848,7 +4823,6 @@
38484823 err = kvmppc_init_vm_radix(kvm);
38494824 if (err)
38504825 return err;
3851
-
38524826 kvmppc_rmap_reset(kvm);
38534827 /* Mutual exclusion with kvm_unmap_hva_range etc. */
38544828 spin_lock(&kvm->mmu_lock);
....@@ -3946,6 +4920,10 @@
39464920 char buf[32];
39474921 int ret;
39484922
4923
+ mutex_init(&kvm->arch.uvmem_lock);
4924
+ INIT_LIST_HEAD(&kvm->arch.uvmem_pfns);
4925
+ mutex_init(&kvm->arch.mmu_setup_lock);
4926
+
39494927 /* Allocate the guest's logical partition ID */
39504928
39514929 lpid = kvmppc_alloc_lpid();
....@@ -3954,6 +4932,8 @@
39544932 kvm->arch.lpid = lpid;
39554933
39564934 kvmppc_alloc_host_rm_ops();
4935
+
4936
+ kvmhv_vm_nested_init(kvm);
39574937
39584938 /*
39594939 * Since we don't flush the TLB when tearing down a VM,
....@@ -3973,9 +4953,13 @@
39734953 kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
39744954
39754955 /* Init LPCR for virtual RMA mode */
3976
- kvm->arch.host_lpid = mfspr(SPRN_LPID);
3977
- kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
3978
- lpcr &= LPCR_PECE | LPCR_LPES;
4956
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
4957
+ kvm->arch.host_lpid = mfspr(SPRN_LPID);
4958
+ kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
4959
+ lpcr &= LPCR_PECE | LPCR_LPES;
4960
+ } else {
4961
+ lpcr = 0;
4962
+ }
39794963 lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
39804964 LPCR_VPM0 | LPCR_VPM1;
39814965 kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
....@@ -3998,7 +4982,7 @@
39984982 * If xive is enabled, we route 0x500 interrupts directly
39994983 * to the guest.
40004984 */
4001
- if (xive_enabled())
4985
+ if (xics_on_xive())
40024986 lpcr |= LPCR_LPES;
40034987 }
40044988
....@@ -4042,8 +5026,14 @@
40425026 * On POWER9, we only need to do this if the "indep_threads_mode"
40435027 * module parameter has been set to N.
40445028 */
4045
- if (cpu_has_feature(CPU_FTR_ARCH_300))
4046
- kvm->arch.threads_indep = indep_threads_mode;
5029
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5030
+ if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
5031
+ pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
5032
+ kvm->arch.threads_indep = true;
5033
+ } else {
5034
+ kvm->arch.threads_indep = indep_threads_mode;
5035
+ }
5036
+ }
40475037 if (!kvm->arch.threads_indep)
40485038 kvm_hv_vm_activated();
40495039
....@@ -4066,6 +5056,8 @@
40665056 snprintf(buf, sizeof(buf), "vm%d", current->pid);
40675057 kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
40685058 kvmppc_mmu_debugfs_init(kvm);
5059
+ if (radix_enabled())
5060
+ kvmhv_radix_debugfs_init(kvm);
40695061
40705062 return 0;
40715063 }
....@@ -4088,18 +5080,29 @@
40885080
40895081 kvmppc_free_vcores(kvm);
40905082
4091
- kvmppc_free_lpid(kvm->arch.lpid);
40925083
40935084 if (kvm_is_radix(kvm))
40945085 kvmppc_free_radix(kvm);
40955086 else
40965087 kvmppc_free_hpt(&kvm->arch.hpt);
40975088
5089
+ /* Perform global invalidation and return lpid to the pool */
5090
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
5091
+ if (nesting_enabled(kvm))
5092
+ kvmhv_release_all_nested(kvm);
5093
+ kvm->arch.process_table = 0;
5094
+ if (kvm->arch.secure_guest)
5095
+ uv_svm_terminate(kvm->arch.lpid);
5096
+ kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
5097
+ }
5098
+
5099
+ kvmppc_free_lpid(kvm->arch.lpid);
5100
+
40985101 kvmppc_free_pimap(kvm);
40995102 }
41005103
41015104 /* We don't need to emulate any privileged instructions or dcbz */
4102
-static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
5105
+static int kvmppc_core_emulate_op_hv(struct kvm_vcpu *vcpu,
41035106 unsigned int inst, int *advance)
41045107 {
41055108 return EMULATE_FAIL;
....@@ -4119,11 +5122,15 @@
41195122
41205123 static int kvmppc_core_check_processor_compat_hv(void)
41215124 {
4122
- if (!cpu_has_feature(CPU_FTR_HVMODE) ||
4123
- !cpu_has_feature(CPU_FTR_ARCH_206))
4124
- return -EIO;
5125
+ if (cpu_has_feature(CPU_FTR_HVMODE) &&
5126
+ cpu_has_feature(CPU_FTR_ARCH_206))
5127
+ return 0;
41255128
4126
- return 0;
5129
+ /* POWER9 in radix mode is capable of being a nested hypervisor. */
5130
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
5131
+ return 0;
5132
+
5133
+ return -EIO;
41275134 }
41285135
41295136 #ifdef CONFIG_KVM_XICS
....@@ -4214,7 +5221,7 @@
42145221 if (i == pimap->n_mapped)
42155222 pimap->n_mapped++;
42165223
4217
- if (xive_enabled())
5224
+ if (xics_on_xive())
42185225 rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
42195226 else
42205227 kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
....@@ -4255,7 +5262,7 @@
42555262 return -ENODEV;
42565263 }
42575264
4258
- if (xive_enabled())
5265
+ if (xics_on_xive())
42595266 rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
42605267 else
42615268 kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
....@@ -4321,6 +5328,12 @@
43215328
43225329 case KVM_PPC_ALLOCATE_HTAB: {
43235330 u32 htab_order;
5331
+
5332
+ /* If we're a nested hypervisor, we currently only support radix */
5333
+ if (kvmhv_on_pseries()) {
5334
+ r = -EOPNOTSUPP;
5335
+ break;
5336
+ }
43245337
43255338 r = -EFAULT;
43265339 if (get_user(htab_order, (u32 __user *)argp))
....@@ -4441,7 +5454,11 @@
44415454 if (radix && !radix_enabled())
44425455 return -EINVAL;
44435456
4444
- mutex_lock(&kvm->lock);
5457
+ /* If we're a nested hypervisor, we currently only support radix */
5458
+ if (kvmhv_on_pseries() && !radix)
5459
+ return -EINVAL;
5460
+
5461
+ mutex_lock(&kvm->arch.mmu_setup_lock);
44455462 if (radix != kvm_is_radix(kvm)) {
44465463 if (kvm->arch.mmu_ready) {
44475464 kvm->arch.mmu_ready = 0;
....@@ -4469,8 +5486,162 @@
44695486 err = 0;
44705487
44715488 out_unlock:
4472
- mutex_unlock(&kvm->lock);
5489
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
44735490 return err;
5491
+}
5492
+
5493
+static int kvmhv_enable_nested(struct kvm *kvm)
5494
+{
5495
+ if (!nested)
5496
+ return -EPERM;
5497
+ if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
5498
+ return -ENODEV;
5499
+
5500
+ /* kvm == NULL means the caller is testing if the capability exists */
5501
+ if (kvm)
5502
+ kvm->arch.nested_enable = true;
5503
+ return 0;
5504
+}
5505
+
5506
+static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5507
+ int size)
5508
+{
5509
+ int rc = -EINVAL;
5510
+
5511
+ if (kvmhv_vcpu_is_radix(vcpu)) {
5512
+ rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
5513
+
5514
+ if (rc > 0)
5515
+ rc = -EINVAL;
5516
+ }
5517
+
5518
+ /* For now quadrants are the only way to access nested guest memory */
5519
+ if (rc && vcpu->arch.nested)
5520
+ rc = -EAGAIN;
5521
+
5522
+ return rc;
5523
+}
5524
+
5525
+static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
5526
+ int size)
5527
+{
5528
+ int rc = -EINVAL;
5529
+
5530
+ if (kvmhv_vcpu_is_radix(vcpu)) {
5531
+ rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
5532
+
5533
+ if (rc > 0)
5534
+ rc = -EINVAL;
5535
+ }
5536
+
5537
+ /* For now quadrants are the only way to access nested guest memory */
5538
+ if (rc && vcpu->arch.nested)
5539
+ rc = -EAGAIN;
5540
+
5541
+ return rc;
5542
+}
5543
+
5544
+static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa)
5545
+{
5546
+ unpin_vpa(kvm, vpa);
5547
+ vpa->gpa = 0;
5548
+ vpa->pinned_addr = NULL;
5549
+ vpa->dirty = false;
5550
+ vpa->update_pending = 0;
5551
+}
5552
+
5553
+/*
5554
+ * Enable a guest to become a secure VM, or test whether
5555
+ * that could be enabled.
5556
+ * Called when the KVM_CAP_PPC_SECURE_GUEST capability is
5557
+ * tested (kvm == NULL) or enabled (kvm != NULL).
5558
+ */
5559
+static int kvmhv_enable_svm(struct kvm *kvm)
5560
+{
5561
+ if (!kvmppc_uvmem_available())
5562
+ return -EINVAL;
5563
+ if (kvm)
5564
+ kvm->arch.svm_enabled = 1;
5565
+ return 0;
5566
+}
5567
+
5568
+/*
5569
+ * IOCTL handler to turn off secure mode of a guest
5570
+ *
5571
+ * - Release all device pages
5572
+ * - Issue ucall to terminate the guest on the UV side
5573
+ * - Unpin the VPA pages.
5574
+ * - Reinit the partition scoped page tables
5575
+ */
5576
+static int kvmhv_svm_off(struct kvm *kvm)
5577
+{
5578
+ struct kvm_vcpu *vcpu;
5579
+ int mmu_was_ready;
5580
+ int srcu_idx;
5581
+ int ret = 0;
5582
+ int i;
5583
+
5584
+ if (!(kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START))
5585
+ return ret;
5586
+
5587
+ mutex_lock(&kvm->arch.mmu_setup_lock);
5588
+ mmu_was_ready = kvm->arch.mmu_ready;
5589
+ if (kvm->arch.mmu_ready) {
5590
+ kvm->arch.mmu_ready = 0;
5591
+ /* order mmu_ready vs. vcpus_running */
5592
+ smp_mb();
5593
+ if (atomic_read(&kvm->arch.vcpus_running)) {
5594
+ kvm->arch.mmu_ready = 1;
5595
+ ret = -EBUSY;
5596
+ goto out;
5597
+ }
5598
+ }
5599
+
5600
+ srcu_idx = srcu_read_lock(&kvm->srcu);
5601
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5602
+ struct kvm_memory_slot *memslot;
5603
+ struct kvm_memslots *slots = __kvm_memslots(kvm, i);
5604
+
5605
+ if (!slots)
5606
+ continue;
5607
+
5608
+ kvm_for_each_memslot(memslot, slots) {
5609
+ kvmppc_uvmem_drop_pages(memslot, kvm, true);
5610
+ uv_unregister_mem_slot(kvm->arch.lpid, memslot->id);
5611
+ }
5612
+ }
5613
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
5614
+
5615
+ ret = uv_svm_terminate(kvm->arch.lpid);
5616
+ if (ret != U_SUCCESS) {
5617
+ ret = -EINVAL;
5618
+ goto out;
5619
+ }
5620
+
5621
+ /*
5622
+ * When secure guest is reset, all the guest pages are sent
5623
+ * to UV via UV_PAGE_IN before the non-boot vcpus get a
5624
+ * chance to run and unpin their VPA pages. Unpinning of all
5625
+ * VPA pages is done here explicitly so that VPA pages
5626
+ * can be migrated to the secure side.
5627
+ *
5628
+ * This is required for the secure SMP guest to reboot
5629
+ * correctly.
5630
+ */
5631
+ kvm_for_each_vcpu(i, vcpu, kvm) {
5632
+ spin_lock(&vcpu->arch.vpa_update_lock);
5633
+ unpin_vpa_reset(kvm, &vcpu->arch.dtl);
5634
+ unpin_vpa_reset(kvm, &vcpu->arch.slb_shadow);
5635
+ unpin_vpa_reset(kvm, &vcpu->arch.vpa);
5636
+ spin_unlock(&vcpu->arch.vpa_update_lock);
5637
+ }
5638
+
5639
+ kvmppc_setup_partition_table(kvm);
5640
+ kvm->arch.secure_guest = 0;
5641
+ kvm->arch.mmu_ready = mmu_was_ready;
5642
+out:
5643
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
5644
+ return ret;
44745645 }
44755646
44765647 static struct kvmppc_ops kvm_ops_hv = {
....@@ -4480,6 +5651,7 @@
44805651 .set_one_reg = kvmppc_set_one_reg_hv,
44815652 .vcpu_load = kvmppc_core_vcpu_load_hv,
44825653 .vcpu_put = kvmppc_core_vcpu_put_hv,
5654
+ .inject_interrupt = kvmppc_inject_interrupt_hv,
44835655 .set_msr = kvmppc_set_msr_hv,
44845656 .vcpu_run = kvmppc_vcpu_run_hv,
44855657 .vcpu_create = kvmppc_core_vcpu_create_hv,
....@@ -4493,9 +5665,7 @@
44935665 .age_hva = kvm_age_hva_hv,
44945666 .test_age_hva = kvm_test_age_hva_hv,
44955667 .set_spte_hva = kvm_set_spte_hva_hv,
4496
- .mmu_destroy = kvmppc_mmu_destroy_hv,
44975668 .free_memslot = kvmppc_core_free_memslot_hv,
4498
- .create_memslot = kvmppc_core_create_memslot_hv,
44995669 .init_vm = kvmppc_core_init_vm_hv,
45005670 .destroy_vm = kvmppc_core_destroy_vm_hv,
45015671 .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
....@@ -4512,6 +5682,11 @@
45125682 .configure_mmu = kvmhv_configure_mmu,
45135683 .get_rmmu_info = kvmhv_get_rmmu_info,
45145684 .set_smt_mode = kvmhv_set_smt_mode,
5685
+ .enable_nested = kvmhv_enable_nested,
5686
+ .load_from_eaddr = kvmhv_load_from_eaddr,
5687
+ .store_to_eaddr = kvmhv_store_to_eaddr,
5688
+ .enable_svm = kvmhv_enable_svm,
5689
+ .svm_off = kvmhv_svm_off,
45155690 };
45165691
45175692 static int kvm_init_subcore_bitmap(void)
....@@ -4529,13 +5704,11 @@
45295704 continue;
45305705
45315706 sibling_subcore_state =
4532
- kmalloc_node(sizeof(struct sibling_subcore_state),
5707
+ kzalloc_node(sizeof(struct sibling_subcore_state),
45335708 GFP_KERNEL, node);
45345709 if (!sibling_subcore_state)
45355710 return -ENOMEM;
45365711
4537
- memset(sibling_subcore_state, 0,
4538
- sizeof(struct sibling_subcore_state));
45395712
45405713 for (j = 0; j < threads_per_core; j++) {
45415714 int cpu = first_cpu + j;
....@@ -4555,12 +5728,22 @@
45555728 static int kvmppc_book3s_init_hv(void)
45565729 {
45575730 int r;
5731
+
5732
+ if (!tlbie_capable) {
5733
+ pr_err("KVM-HV: Host does not support TLBIE\n");
5734
+ return -ENODEV;
5735
+ }
5736
+
45585737 /*
45595738 * FIXME!! Do we need to check on all cpus ?
45605739 */
45615740 r = kvmppc_core_check_processor_compat_hv();
45625741 if (r < 0)
45635742 return -ENODEV;
5743
+
5744
+ r = kvmhv_nested_init();
5745
+ if (r)
5746
+ return r;
45645747
45655748 r = kvm_init_subcore_bitmap();
45665749 if (r)
....@@ -4572,7 +5755,8 @@
45725755 * indirectly, via OPAL.
45735756 */
45745757 #ifdef CONFIG_SMP
4575
- if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
5758
+ if (!xics_on_xive() && !kvmhv_on_pseries() &&
5759
+ !local_paca->kvm_hstate.xics_phys) {
45765760 struct device_node *np;
45775761
45785762 np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
....@@ -4596,8 +5780,11 @@
45965780 if (r)
45975781 return r;
45985782
4599
- if (kvmppc_radix_possible())
5783
+ if (kvmppc_radix_possible()) {
46005784 r = kvmppc_radix_init();
5785
+ if (r)
5786
+ return r;
5787
+ }
46015788
46025789 /*
46035790 * POWER9 chips before version 2.02 can't have some threads in
....@@ -4611,15 +5798,21 @@
46115798 no_mixing_hpt_and_radix = true;
46125799 }
46135800
5801
+ r = kvmppc_uvmem_init();
5802
+ if (r < 0)
5803
+ pr_err("KVM-HV: kvmppc_uvmem_init failed %d\n", r);
5804
+
46145805 return r;
46155806 }
46165807
46175808 static void kvmppc_book3s_exit_hv(void)
46185809 {
5810
+ kvmppc_uvmem_free();
46195811 kvmppc_free_host_rm_ops();
46205812 if (kvmppc_radix_possible())
46215813 kvmppc_radix_exit();
46225814 kvmppc_hv_ops = NULL;
5815
+ kvmhv_nested_exit();
46235816 }
46245817
46255818 module_init(kvmppc_book3s_init_hv);
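
The .enable_nested and .enable_svm callbacks wired into kvm_ops_hv above are driven from userspace through the generic KVM_ENABLE_CAP ioctl on the VM file descriptor. The sketch below is illustrative only and not part of book3s_hv.c: it assumes vm_fd was obtained from a prior KVM_CREATE_VM call, uses only the standard KVM uAPI (KVM_CHECK_EXTENSION / KVM_ENABLE_CAP), and keeps error handling minimal.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Enable one VM-level capability, but only if the host advertises it. */
static int enable_vm_cap(int vm_fd, unsigned int cap)
{
	struct kvm_enable_cap enable = { .cap = cap };

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, cap) <= 0)
		return -1;	/* capability not supported on this host */

	return ioctl(vm_fd, KVM_ENABLE_CAP, &enable);
}

/*
 * For example:
 *   enable_vm_cap(vm_fd, KVM_CAP_PPC_NESTED_HV)    - handled by kvmhv_enable_nested()
 *   enable_vm_cap(vm_fd, KVM_CAP_PPC_SECURE_GUEST) - handled by kvmhv_enable_svm()
 */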