2024-05-10 37f49e37ab4cb5d0bc4c60eb5c6d4dd57db767bb
--- a/kernel/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/kernel/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
@@ -35,7 +24,6 @@
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
-#include <asm/kvm_host.h>
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
@@ -44,7 +32,7 @@
 #ifdef CONFIG_BUG

 #define WARN_ON_ONCE_RM(condition) ({ \
-        static bool __section(.data.unlikely) __warned; \
+        static bool __section(".data.unlikely") __warned; \
         int __ret_warn_once = !!(condition); \
         \
         if (unlikely(__ret_warn_once && !__warned)) { \
@@ -66,8 +54,6 @@

 #endif

-#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
-
 /*
  * Finds a TCE table descriptor by LIOBN.
  *
@@ -87,6 +73,23 @@
 }
 EXPORT_SYMBOL_GPL(kvmppc_find_table);

+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static long kvmppc_rm_tce_to_ua(struct kvm *kvm,
+                unsigned long tce, unsigned long *ua)
+{
+        unsigned long gfn = tce >> PAGE_SHIFT;
+        struct kvm_memory_slot *memslot;
+
+        memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
+        if (!memslot)
+                return -EINVAL;
+
+        *ua = __gfn_to_hva_memslot(memslot, gfn) |
+                (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+        return 0;
+}
+
 /*
  * Validates TCE address.
  * At the moment flags and page mask are validated.
@@ -94,14 +97,14 @@
  * to the table and user space is supposed to process them), we can skip
  * checking other things (such as TCE is a guest RAM address or the page
  * was actually allocated).
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- * mode on PR KVM
  */
-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
+                unsigned long tce)
 {
         unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
         enum dma_data_direction dir = iommu_tce_direction(tce);
+        struct kvmppc_spapr_tce_iommu_table *stit;
+        unsigned long ua = 0;

         /* Allow userspace to poison TCE table */
         if (dir == DMA_NONE)
@@ -110,9 +113,24 @@
         if (iommu_tce_check_gpa(stt->page_shift, gpa))
                 return H_PARAMETER;

+        if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua))
+                return H_TOO_HARD;
+
+        list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+                unsigned long hpa = 0;
+                struct mm_iommu_table_group_mem_t *mem;
+                long shift = stit->tbl->it_page_shift;
+
+                mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
+                if (!mem)
+                        return H_TOO_HARD;
+
+                if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+                        return H_TOO_HARD;
+        }
+
         return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);

 /* Note on the use of page_address() in real mode,
  *
@@ -144,13 +162,9 @@
 /*
  * Handles TCE requests for emulated devices.
  * Puts guest TCE values to the table and expects user space to convert them.
- * Called in both real and virtual modes.
- * Cannot fail so kvmppc_tce_validate must be called before it.
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- * mode on PR KVM
+ * Cannot fail so kvmppc_rm_tce_validate must be called before it.
  */
-void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt,
                 unsigned long idx, unsigned long tce)
 {
         struct page *page;
@@ -158,46 +172,63 @@

         idx -= stt->offset;
         page = stt->pages[idx / TCES_PER_PAGE];
+        /*
+         * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is
+         * being cleared, otherwise it returns H_TOO_HARD and we skip this.
+         */
+        if (!page) {
+                WARN_ON_ONCE_RM(tce != 0);
+                return;
+        }
         tbl = kvmppc_page_address(page);

         tbl[idx % TCES_PER_PAGE] = tce;
 }
-EXPORT_SYMBOL_GPL(kvmppc_tce_put);

-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
-                unsigned long *ua, unsigned long **prmap)
+/*
+ * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so
+ * in real mode.
+ * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is
+ * allocated or not required (when clearing a tce entry).
+ */
+static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+                unsigned long ioba, unsigned long npages, bool clearing)
 {
-        unsigned long gfn = gpa >> PAGE_SHIFT;
-        struct kvm_memory_slot *memslot;
+        unsigned long i, idx, sttpage, sttpages;
+        unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);

-        memslot = search_memslots(kvm_memslots(kvm), gfn);
-        if (!memslot)
-                return -EINVAL;
+        if (ret)
+                return ret;
+        /*
+         * clearing==true says kvmppc_rm_tce_put won't be allocating pages
+         * for empty tces.
+         */
+        if (clearing)
+                return H_SUCCESS;

-        *ua = __gfn_to_hva_memslot(memslot, gfn) |
-                (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+        idx = (ioba >> stt->page_shift) - stt->offset;
+        sttpage = idx / TCES_PER_PAGE;
+        sttpages = ALIGN(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
+                TCES_PER_PAGE;
+        for (i = sttpage; i < sttpage + sttpages; ++i)
+                if (!stt->pages[i])
+                        return H_TOO_HARD;

-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-        if (prmap)
-                *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-#endif
-
-        return 0;
+        return H_SUCCESS;
 }
-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);

-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
+static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
+                struct iommu_table *tbl,
                 unsigned long entry, unsigned long *hpa,
                 enum dma_data_direction *direction)
 {
         long ret;

-        ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+        ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true);

         if (!ret && ((*direction == DMA_FROM_DEVICE) ||
                 (*direction == DMA_BIDIRECTIONAL))) {
-                __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+                __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
                 /*
                  * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
                  * calling this so we still get here a valid UA.
@@ -209,13 +240,26 @@
         return ret;
 }

-static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
-                unsigned long entry)
+static void iommu_tce_kill_rm(struct iommu_table *tbl,
+                unsigned long entry, unsigned long pages)
 {
-        unsigned long hpa = 0;
-        enum dma_data_direction dir = DMA_NONE;
+        if (tbl->it_ops->tce_kill)
+                tbl->it_ops->tce_kill(tbl, entry, pages, true);
+}

-        iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+static void kvmppc_rm_clear_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *stt,
+                struct iommu_table *tbl, unsigned long entry)
+{
+        unsigned long i;
+        unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+        unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift);
+
+        for (i = 0; i < subpages; ++i) {
+                unsigned long hpa = 0;
+                enum dma_data_direction dir = DMA_NONE;
+
+                iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, io_entry + i, &hpa, &dir);
+        }
 }

 static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -223,7 +267,7 @@
 {
         struct mm_iommu_table_group_mem_t *mem = NULL;
         const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

         if (!pua)
                 /* it_userspace allocation might be delayed */
@@ -247,7 +291,7 @@
         unsigned long hpa = 0;
         long ret;

-        if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir))
+        if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir))
                 /*
                  * real mode xchg can fail if struct page crosses
                  * a page boundary
@@ -259,7 +303,7 @@

         ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
         if (ret)
-                iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+                iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);

         return ret;
 }
@@ -278,6 +322,8 @@
                         break;
         }

+        iommu_tce_kill_rm(tbl, io_entry, subpages);
+
         return ret;
 }

@@ -287,7 +333,7 @@
 {
         long ret;
         unsigned long hpa = 0;
-        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+        __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
         struct mm_iommu_table_group_mem_t *mem;

         if (!pua)
@@ -305,7 +351,7 @@
         if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
                 return H_TOO_HARD;

-        ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+        ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
         if (ret) {
                 mm_iommu_mapped_dec(mem);
                 /*
@@ -341,6 +387,8 @@
                         break;
         }

+        iommu_tce_kill_rm(tbl, io_entry, subpages);
+
         return ret;
 }

@@ -364,17 +412,16 @@
         if (!stt)
                 return H_TOO_HARD;

-        ret = kvmppc_ioba_validate(stt, ioba, 1);
+        ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
         if (ret != H_SUCCESS)
                 return ret;

-        ret = kvmppc_tce_validate(stt, tce);
+        ret = kvmppc_rm_tce_validate(stt, tce);
         if (ret != H_SUCCESS)
                 return ret;

         dir = iommu_tce_direction(tce);
-        if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-                        tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+        if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua))
                 return H_PARAMETER;

         entry = ioba >> stt->page_shift;
@@ -387,23 +434,19 @@
                 ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
                                 stit->tbl, entry, ua, dir);

-                if (ret == H_SUCCESS)
-                        continue;
-
-                if (ret == H_TOO_HARD)
+                if (ret != H_SUCCESS) {
+                        kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry);
                         return ret;
-
-                WARN_ON_ONCE_RM(1);
-                kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+                }
         }

-        kvmppc_tce_put(stt, entry, tce);
+        kvmppc_rm_tce_put(stt, entry, tce);

         return H_SUCCESS;
 }

-static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
-                unsigned long ua, unsigned long *phpa)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu, unsigned long mmu_seq,
+                unsigned long ua, unsigned long *phpa)
 {
         pte_t *ptep, pte;
         unsigned shift = 0;
@@ -417,10 +460,17 @@
          * to exit which will agains result in the below page table walk
          * to finish.
          */
-        ptep = __find_linux_pte(vcpu->arch.pgdir, ua, NULL, &shift);
-        if (!ptep || !pte_present(*ptep))
+        /* an rmap lock won't make it safe. because that just ensure hash
+         * page table entries are removed with rmap lock held. After that
+         * mmu notifier returns and we go ahead and removing ptes from Qemu page table.
+         */
+        ptep = find_kvm_host_pte(vcpu->kvm, mmu_seq, ua, &shift);
+        if (!ptep)
                 return -ENXIO;
-        pte = *ptep;
+
+        pte = READ_ONCE(*ptep);
+        if (!pte_present(pte))
+                return -ENXIO;

         if (!shift)
                 shift = PAGE_SHIFT;
@@ -442,16 +492,23 @@
                 unsigned long liobn, unsigned long ioba,
                 unsigned long tce_list, unsigned long npages)
 {
+        struct kvm *kvm = vcpu->kvm;
         struct kvmppc_spapr_tce_table *stt;
         long i, ret = H_SUCCESS;
         unsigned long tces, entry, ua = 0;
-        unsigned long *rmap = NULL;
+        unsigned long mmu_seq;
         bool prereg = false;
         struct kvmppc_spapr_tce_iommu_table *stit;

         /* For radix, we might be in virtual mode, so punt */
         if (kvm_is_radix(vcpu->kvm))
                 return H_TOO_HARD;
+
+        /*
+         * used to check for invalidations in progress
+         */
+        mmu_seq = kvm->mmu_notifier_seq;
+        smp_rmb();

         stt = kvmppc_find_table(vcpu->kvm, liobn);
         if (!stt)
@@ -468,7 +525,7 @@
         if (tce_list & (SZ_4K - 1))
                 return H_PARAMETER;

-        ret = kvmppc_ioba_validate(stt, ioba, npages);
+        ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
         if (ret != H_SUCCESS)
                 return ret;

@@ -480,7 +537,7 @@
                  */
                 struct mm_iommu_table_group_mem_t *mem;

-                if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+                if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
                         return H_TOO_HARD;

                 mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -496,23 +553,11 @@
                  * We do not require memory to be preregistered in this case
                  * so lock rmap and do __find_linux_pte_or_hugepte().
                  */
-                if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+                if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua))
                         return H_TOO_HARD;

-                rmap = (void *) vmalloc_to_phys(rmap);
-                if (WARN_ON_ONCE_RM(!rmap))
-                        return H_TOO_HARD;
-
-                /*
-                 * Synchronize with the MMU notifier callbacks in
-                 * book3s_64_mmu_hv.c (kvm_unmap_hva_range_hv etc.).
-                 * While we have the rmap lock, code running on other CPUs
-                 * cannot finish unmapping the host real page that backs
-                 * this guest real page, so we are OK to access the host
-                 * real page.
-                 */
-                lock_rmap(rmap);
-                if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+                arch_spin_lock(&kvm->mmu_lock.rlock.raw_lock);
+                if (kvmppc_rm_ua_to_hpa(vcpu, mmu_seq, ua, &tces)) {
                         ret = H_TOO_HARD;
                         goto unlock_exit;
                 }
@@ -521,14 +566,16 @@
         for (i = 0; i < npages; ++i) {
                 unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);

-                ret = kvmppc_tce_validate(stt, tce);
+                ret = kvmppc_rm_tce_validate(stt, tce);
                 if (ret != H_SUCCESS)
                         goto unlock_exit;
+        }
+
+        for (i = 0; i < npages; ++i) {
+                unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);

                 ua = 0;
-                if (kvmppc_gpa_to_ua(vcpu->kvm,
-                                tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-                                &ua, NULL)) {
+                if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua)) {
                         ret = H_PARAMETER;
                         goto unlock_exit;
                 }
@@ -538,23 +585,19 @@
                                         stit->tbl, entry + i, ua,
                                         iommu_tce_direction(tce));

-                        if (ret == H_SUCCESS)
-                                continue;
-
-                        if (ret == H_TOO_HARD)
+                        if (ret != H_SUCCESS) {
+                                kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl,
+                                                entry + i);
                                 goto unlock_exit;
-
-                        WARN_ON_ONCE_RM(1);
-                        kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+                        }
                 }

-                kvmppc_tce_put(stt, entry + i, tce);
+                kvmppc_rm_tce_put(stt, entry + i, tce);
         }

 unlock_exit:
-        if (rmap)
-                unlock_rmap(rmap);
-
+        if (!prereg)
+                arch_spin_unlock(&kvm->mmu_lock.rlock.raw_lock);
         return ret;
 }

@@ -574,7 +617,7 @@
         if (!stt)
                 return H_TOO_HARD;

-        ret = kvmppc_ioba_validate(stt, ioba, npages);
+        ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
         if (ret != H_SUCCESS)
                 return ret;

@@ -596,14 +639,14 @@
                                 return ret;

                         WARN_ON_ONCE_RM(1);
-                        kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+                        kvmppc_rm_clear_tce(vcpu->kvm, stt, stit->tbl, entry + i);
                 }
         }

         for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
-                kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+                kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);

-        return H_SUCCESS;
+        return ret;
 }

 /* This can be called in either virtual mode or real mode */
@@ -626,6 +669,10 @@

         idx = (ioba >> stt->page_shift) - stt->offset;
         page = stt->pages[idx / TCES_PER_PAGE];
+        if (!page) {
+                vcpu->arch.regs.gpr[4] = 0;
+                return H_SUCCESS;
+        }
         tbl = (u64 *)page_address(page);

         vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
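
Illustrative sketch (not part of the diff above): the address math behind the newly added kvmppc_rm_tce_to_ua() helper. A TCE carries a guest physical address plus read/write permission bits in the low bits; the helper shifts the TCE right by the page shift to get the guest frame number, resolves that frame to a host virtual address via the memslot, and ORs back the in-page offset after masking off the page frame and the permission bits. The page size, permission-bit values, base address and the gfn-to-hva stand-in below are assumptions chosen for the example, since the real search_memslots()/__gfn_to_hva_memslot() helpers are kernel-internal.

/* Standalone userspace sketch of the TCE -> userspace-address conversion. */
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SHIFT    16                       /* assumption: 64 KiB pages */
#define EX_PAGE_SIZE     (1ULL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK     (~(EX_PAGE_SIZE - 1))
#define EX_TCE_PCI_READ  0x1ULL                   /* illustrative permission bits */
#define EX_TCE_PCI_WRITE 0x2ULL

/* Stand-in for __gfn_to_hva_memslot(): pretend guest frames are mapped at a fixed base. */
static uint64_t example_gfn_to_hva(uint64_t gfn)
{
        return 0x7f0000000000ULL + (gfn << EX_PAGE_SHIFT);
}

static uint64_t example_tce_to_ua(uint64_t tce)
{
        uint64_t gfn = tce >> EX_PAGE_SHIFT;
        /* Keep only the in-page offset: drop the page frame and the permission bits. */
        uint64_t off = tce & ~(EX_PAGE_MASK | EX_TCE_PCI_READ | EX_TCE_PCI_WRITE);

        return example_gfn_to_hva(gfn) | off;
}

int main(void)
{
        uint64_t tce = 0x40001234ULL | EX_TCE_PCI_READ | EX_TCE_PCI_WRITE;

        /* Prints the userspace address that would back this TCE in the sketch. */
        printf("tce 0x%llx -> ua 0x%llx\n",
               (unsigned long long)tce,
               (unsigned long long)example_tce_to_ua(tce));
        return 0;
}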