2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/powerpc/kvm/book3s_64_vio.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
@@ -30,6 +19,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/iommu.h>
 #include <linux/file.h>
+#include <linux/mm.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -37,7 +27,6 @@
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
-#include <asm/kvm_host.h>
 #include <asm/udbg.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
@@ -54,43 +43,6 @@
 			(tce_pages * sizeof(struct page *));
 
 	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
-}
-
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-	long ret = 0;
-
-	if (!current || !current->mm)
-		return ret; /* process exited */
-
-	down_write(&current->mm->mmap_sem);
-
-	if (inc) {
-		unsigned long locked, lock_limit;
-
-		locked = current->mm->locked_vm + stt_pages;
-		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-			ret = -ENOMEM;
-		else
-			current->mm->locked_vm += stt_pages;
-	} else {
-		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-			stt_pages = current->mm->locked_vm;
-
-		current->mm->locked_vm -= stt_pages;
-	}
-
-	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-			inc ? '+' : '-',
-			stt_pages << PAGE_SHIFT,
-			current->mm->locked_vm << PAGE_SHIFT,
-			rlimit(RLIMIT_MEMLOCK),
-			ret ? " - exceeded" : "");
-
-	up_write(&current->mm->mmap_sem);
-
-	return ret;
 }
 
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
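
Note: the hand-rolled accounting removed above duplicated logic that now lives in the generic mm helper account_locked_vm() (declared in <linux/mm.h>); the helper takes mmap_lock itself and enforces RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK. The later hunks switch both the charge and the uncharge sites to it. A minimal sketch of the replacement call pattern (illustrative only, not code from this patch):

	#include <linux/mm.h>

	/* Charge npages against the mm's RLIMIT_MEMLOCK, then release. */
	static int charge_example(struct mm_struct *mm, unsigned long npages)
	{
		int ret = account_locked_vm(mm, npages, true);	/* may fail with -ENOMEM */

		if (ret)
			return ret;
		/* ... allocate and use the pinned memory ... */
		account_locked_vm(mm, npages, false);	/* uncharge; does not fail */
		return 0;
	}
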
@@ -121,6 +73,7 @@
 	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
 	struct iommu_table_group *table_group = NULL;
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
 
 		table_group = iommu_group_get_iommudata(grp);
@@ -135,7 +88,9 @@
 				kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
 			}
 		}
+		cond_resched_rcu();
 	}
+	rcu_read_unlock();
 }
 
 extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
@@ -153,12 +108,14 @@
 	if (!f.file)
 		return -EBADF;
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) {
 		if (stt == f.file->private_data) {
 			found = true;
 			break;
 		}
 	}
+	rcu_read_unlock();
 
 	fdput(f);
 
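
Note: these hunks put an explicit rcu_read_lock()/rcu_read_unlock() pair around every traversal of the RCU-protected table lists; list_for_each_entry_rcu() by itself does not make the walk safe unless the caller is already inside an RCU read-side critical section (or holds the update-side lock). The cond_resched_rcu() in the release path briefly exits the critical section between iterations so a long list cannot stall the CPU. The generic pattern, as a sketch with a hypothetical item type:

	#include <linux/rculist.h>

	struct item {
		struct list_head list;
		int val;
	};

	static bool find_val(struct list_head *head, int val)
	{
		struct item *it;
		bool found = false;

		rcu_read_lock();			/* pin the RCU grace period */
		list_for_each_entry_rcu(it, head, list) {
			if (it->val == val) {
				found = true;
				break;
			}
		}
		rcu_read_unlock();
		return found;
	}
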
@@ -191,6 +148,7 @@
 	if (!tbl)
 		return -EINVAL;
 
+	rcu_read_lock();
 	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
 		if (tbl != stit->tbl)
 			continue;
@@ -198,14 +156,17 @@
 		if (!kref_get_unless_zero(&stit->kref)) {
 			/* stit is being destroyed */
 			iommu_tce_table_put(tbl);
+			rcu_read_unlock();
 			return -ENOTTY;
 		}
 		/*
 		 * The table is already known to this KVM, we just increased
 		 * its KVM reference counter and can return.
 		 */
+		rcu_read_unlock();
 		return 0;
 	}
+	rcu_read_unlock();
 
 	stit = kzalloc(sizeof(*stit), GFP_KERNEL);
 	if (!stit) {
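
Note: the attach path combines the RCU walk with kref_get_unless_zero(), the standard idiom for taking a reference on an RCU-protected object whose refcount may concurrently drop to zero; observe that every return path now leaves the critical section. A sketch of the idiom with a hypothetical object type:

	#include <linux/kref.h>
	#include <linux/rculist.h>

	struct obj {
		struct list_head list;
		struct kref kref;
	};

	/* Return a referenced object from the list, or NULL if the only
	 * candidates are already on their way to destruction. */
	static struct obj *obj_get(struct list_head *head)
	{
		struct obj *o;

		rcu_read_lock();
		list_for_each_entry_rcu(o, head, list) {
			if (kref_get_unless_zero(&o->kref)) {
				rcu_read_unlock();
				return o;	/* caller owns a reference */
			}
		}
		rcu_read_unlock();
		return NULL;
	}
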
@@ -228,9 +189,31 @@
 	unsigned long i, npages = kvmppc_tce_pages(stt->size);
 
 	for (i = 0; i < npages; i++)
-		__free_page(stt->pages[i]);
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
 
 	kfree(stt);
+}
+
+static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
+		unsigned long sttpage)
+{
+	struct page *page = stt->pages[sttpage];
+
+	if (page)
+		return page;
+
+	mutex_lock(&stt->alloc_lock);
+	page = stt->pages[sttpage];
+	if (!page) {
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		WARN_ON_ONCE(!page);
+		if (page)
+			stt->pages[sttpage] = page;
+	}
+	mutex_unlock(&stt->alloc_lock);
+
+	return page;
 }
 
 static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
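
Note: kvm_spapr_get_tce_page() is a double-checked allocation: the unlocked read is the fast path, and the slot is re-read under alloc_lock so two racing faults cannot both install a page. Together with the now-conditional __free_page() above, this converts the TCE table to on-demand backing, so a guest with a large DMA window only pays for the pages it actually touches. The idiom in isolation (a sketch, not the kernel's exact code):

	#include <linux/gfp.h>
	#include <linux/mutex.h>

	static struct page *get_or_alloc(struct page **slot, struct mutex *lock)
	{
		struct page *page = READ_ONCE(*slot);

		if (page)			/* fast path, no lock taken */
			return page;

		mutex_lock(lock);
		page = *slot;			/* re-check under the lock */
		if (!page) {
			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
			if (page)
				WRITE_ONCE(*slot, page);
		}
		mutex_unlock(lock);
		return page;
	}
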
@@ -241,7 +224,10 @@
 	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
 		return VM_FAULT_SIGBUS;
 
-	page = stt->pages[vmf->pgoff];
+	page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
+	if (!page)
+		return VM_FAULT_OOM;
+
 	get_page(page);
 	vmf->page = page;
 	return 0;
@@ -275,10 +261,11 @@
 		}
 	}
 
+	account_locked_vm(kvm->mm,
+		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+
 	kvm_put_kvm(stt->kvm);
 
-	kvmppc_account_memlimit(
-		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
 	call_rcu(&stt->rcu, release_spapr_tce_table);
 
 	return 0;
@@ -294,16 +281,16 @@
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
 	struct kvmppc_spapr_tce_table *siter;
+	struct mm_struct *mm = kvm->mm;
 	unsigned long npages, size = args->size;
-	int ret = -ENOMEM;
-	int i;
+	int ret;
 
 	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
 			(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
 		return -EINVAL;
 
 	npages = kvmppc_tce_pages(size);
-	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	ret = account_locked_vm(mm, kvmppc_stt_pages(npages), true);
 	if (ret)
 		return ret;
 
@@ -318,13 +305,8 @@
 	stt->offset = args->offset;
 	stt->size = size;
 	stt->kvm = kvm;
+	mutex_init(&stt->alloc_lock);
 	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
-
-	for (i = 0; i < npages; i++) {
-		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
-		if (!stt->pages[i])
-			goto fail;
-	}
 
 	mutex_lock(&kvm->lock);
 
@@ -337,37 +319,121 @@
 		}
 	}
 
+	kvm_get_kvm(kvm);
 	if (!ret)
 		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
 				stt, O_RDWR | O_CLOEXEC);
 
-	if (ret >= 0) {
+	if (ret >= 0)
 		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
-		kvm_get_kvm(kvm);
-	}
+	else
+		kvm_put_kvm_no_destroy(kvm);
 
 	mutex_unlock(&kvm->lock);
 
 	if (ret >= 0)
 		return ret;
 
-fail:
-	for (i = 0; i < npages; i++)
-		if (stt->pages[i])
-			__free_page(stt->pages[i]);
-
 	kfree(stt);
 fail_acct:
-	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+	account_locked_vm(mm, kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
-static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
+		unsigned long *ua)
 {
-	unsigned long hpa = 0;
-	enum dma_data_direction dir = DMA_NONE;
+	unsigned long gfn = tce >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
 
-	iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	memslot = search_memslots(kvm_memslots(kvm), gfn);
+	if (!memslot)
+		return -EINVAL;
+
+	*ua = __gfn_to_hva_memslot(memslot, gfn) |
+		(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+	return 0;
+}
+
+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long tce)
+{
+	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+	enum dma_data_direction dir = iommu_tce_direction(tce);
+	struct kvmppc_spapr_tce_iommu_table *stit;
+	unsigned long ua = 0;
+
+	/* Allow userspace to poison TCE table */
+	if (dir == DMA_NONE)
+		return H_SUCCESS;
+
+	if (iommu_tce_check_gpa(stt->page_shift, gpa))
+		return H_TOO_HARD;
+
+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
+		return H_TOO_HARD;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+		unsigned long hpa = 0;
+		struct mm_iommu_table_group_mem_t *mem;
+		long shift = stit->tbl->it_page_shift;
+
+		mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+		if (!mem || mm_iommu_ua_to_hpa(mem, ua, shift, &hpa)) {
+			rcu_read_unlock();
+			return H_TOO_HARD;
+		}
+	}
+	rcu_read_unlock();
+
+	return H_SUCCESS;
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ */
+static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+		unsigned long idx, unsigned long tce)
+{
+	struct page *page;
+	u64 *tbl;
+	unsigned long sttpage;
+
+	idx -= stt->offset;
+	sttpage = idx / TCES_PER_PAGE;
+	page = stt->pages[sttpage];
+
+	if (!page) {
+		/* We allow any TCE, not just with read|write permissions */
+		if (!tce)
+			return;
+
+		page = kvm_spapr_get_tce_page(stt, sttpage);
+		if (!page)
+			return;
+	}
+	tbl = page_to_virt(page);
+
+	tbl[idx % TCES_PER_PAGE] = tce;
+}
+
+static void kvmppc_clear_tce(struct mm_struct *mm, struct kvmppc_spapr_tce_table *stt,
+		struct iommu_table *tbl, unsigned long entry)
+{
+	unsigned long i;
+	unsigned long subpages = 1ULL << (stt->page_shift - tbl->it_page_shift);
+	unsigned long io_entry = entry << (stt->page_shift - tbl->it_page_shift);
+
+	for (i = 0; i < subpages; ++i) {
+		unsigned long hpa = 0;
+		enum dma_data_direction dir = DMA_NONE;
+
+		iommu_tce_xchg_no_kill(mm, tbl, io_entry + i, &hpa, &dir);
+	}
 }
 
 static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
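
Note: this hunk does several things at once. Taking the kvm reference with kvm_get_kvm() before anon_inode_getfd() closes a race: once the fd exists, userspace can close it immediately and the release callback drops a reference, so the reference must exist before the fd does; on failure it is returned with kvm_put_kvm_no_destroy(), which asserts it cannot be the final put. The new kvmppc_tce_to_ua() resolves a guest TCE to a userspace address through the memslots, kvmppc_tce_validate() pre-checks a TCE so that kvmppc_tce_put() (which now tolerates sparsely allocated table pages) can no longer fail, and kvmppc_clear_tce() clears every hardware IOMMU entry covered by one guest entry. The sub-page arithmetic, worked through with illustrative values:

	/* A 64K guest TCE page over a 4K hardware IOMMU table. */
	static void subpage_example(void)
	{
		unsigned long stt_page_shift = 16;	/* stt->page_shift */
		unsigned long it_page_shift = 12;	/* tbl->it_page_shift */
		unsigned long entry = 5;		/* guest TCE index */

		unsigned long subpages = 1UL << (stt_page_shift - it_page_shift);
		unsigned long io_entry = entry << (stt_page_shift - it_page_shift);

		/* subpages == 16, io_entry == 80: hardware entries 80..95
		 * are cleared, where the old code cleared a single entry. */
	}
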
@@ -375,11 +441,10 @@
 {
 	struct mm_iommu_table_group_mem_t *mem = NULL;
 	const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+	__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
 	if (!pua)
-		/* it_userspace allocation might be delayed */
-		return H_TOO_HARD;
+		return H_SUCCESS;
 
 	mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
 	if (!mem)
@@ -399,7 +464,8 @@
 	unsigned long hpa = 0;
 	long ret;
 
-	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
+	if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
+					&dir)))
 		return H_TOO_HARD;
 
 	if (dir == DMA_NONE)
407473
408474 ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
409475 if (ret != H_SUCCESS)
410
- iommu_tce_xchg(tbl, entry, &hpa, &dir);
476
+ iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
411477
412478 return ret;
413479 }
@@ -426,10 +492,12 @@
 			break;
 	}
 
+	iommu_tce_kill(tbl, io_entry, subpages);
+
 	return ret;
 }
 
-long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
+static long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 		unsigned long entry, unsigned long ua,
 		enum dma_data_direction dir)
 {
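
Note: iommu_tce_xchg() is replaced throughout by iommu_tce_xchg_no_kill() plus an explicit iommu_tce_kill() after the loop, so the TCE cache invalidation is issued once per updated range instead of once per entry. A sketch of the resulting update pattern (names mirror the surrounding diff; error handling elided):

	/* Update a run of hardware TCEs, then invalidate the range once. */
	static void update_range(struct mm_struct *mm, struct iommu_table *tbl,
			unsigned long io_entry, unsigned long subpages)
	{
		unsigned long i;

		for (i = 0; i < subpages; ++i) {
			unsigned long hpa = 0;
			enum dma_data_direction dir = DMA_NONE;

			if (iommu_tce_xchg_no_kill(mm, tbl, io_entry + i,
						   &hpa, &dir))
				break;
		}
		iommu_tce_kill(tbl, io_entry, subpages);	/* one flush */
	}
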
@@ -453,7 +521,7 @@
 	if (mm_iommu_mapped_inc(mem))
 		return H_TOO_HARD;
 
-	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+	ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
 	if (WARN_ON_ONCE(ret)) {
 		mm_iommu_mapped_dec(mem);
 		return H_TOO_HARD;
@@ -485,6 +553,8 @@
 			break;
 	}
 
+	iommu_tce_kill(tbl, io_entry, subpages);
+
 	return ret;
 }
 
@@ -508,16 +578,15 @@
 	if (ret != H_SUCCESS)
 		return ret;
 
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+
 	ret = kvmppc_tce_validate(stt, tce);
 	if (ret != H_SUCCESS)
-		return ret;
+		goto unlock_exit;
 
 	dir = iommu_tce_direction(tce);
 
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
-
-	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
 		ret = H_PARAMETER;
 		goto unlock_exit;
 	}
@@ -532,14 +601,11 @@
 		ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
 				entry, ua, dir);
 
-		if (ret == H_SUCCESS)
-			continue;
 
-		if (ret == H_TOO_HARD)
+		if (ret != H_SUCCESS) {
+			kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry);
 			goto unlock_exit;
-
-		WARN_ON_ONCE(1);
-		kvmppc_clear_tce(stit->tbl, entry);
+		}
 	}
 
 	kvmppc_tce_put(stt, entry, tce);
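
Note: the three-way outcome handling (continue on H_SUCCESS, bail on H_TOO_HARD, warn and carry on otherwise) collapses into a single branch: any failure now clears the affected entry before exiting, so a partially applied H_PUT_TCE can no longer leave a stale hardware mapping behind, and the WARN is dropped because a failure here is a legitimate runtime condition rather than a kernel bug.
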
@@ -582,7 +648,7 @@
 		return ret;
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+	if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
 		ret = H_TOO_HARD;
 		goto unlock_exit;
 	}
@@ -598,10 +664,26 @@
 		ret = kvmppc_tce_validate(stt, tce);
 		if (ret != H_SUCCESS)
 			goto unlock_exit;
+	}
 
-		if (kvmppc_gpa_to_ua(vcpu->kvm,
-				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
-				&ua, NULL)) {
+	for (i = 0; i < npages; ++i) {
+		/*
+		 * This looks unsafe, because we validate, then regrab
+		 * the TCE from userspace which could have been changed by
+		 * another thread.
+		 *
+		 * But it actually is safe, because the relevant checks will be
+		 * re-executed in the following code. If userspace tries to
+		 * change this dodgily it will result in a messier failure mode
+		 * but won't threaten the host.
+		 */
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
+
+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
 			ret = H_PARAMETER;
 			goto unlock_exit;
 		}
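
Note: the indirect handler now runs two passes over the guest's TCE list: a pure validation pass, then a mapping pass that re-reads each TCE with get_user(). The in-line comment documents the deliberate TOCTOU tradeoff: because every value fetched in the second pass is re-checked before use, a guest racing against itself can only corrupt its own mappings, never the host. Also note the tce = be64_to_cpu(tce) step: the list lives in guest memory in big-endian format, so each entry is byte-swapped after the fetch.
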
@@ -611,14 +693,11 @@
 				stit->tbl, entry + i, ua,
 				iommu_tce_direction(tce));
 
-			if (ret == H_SUCCESS)
-				continue;
-
-			if (ret == H_TOO_HARD)
+			if (ret != H_SUCCESS) {
+				kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl,
+						entry + i);
 				goto unlock_exit;
-
-			WARN_ON_ONCE(1);
-			kvmppc_clear_tce(stit->tbl, entry);
+			}
 		}
 
 		kvmppc_tce_put(stt, entry + i, tce);
665744 return ret;
666745
667746 WARN_ON_ONCE(1);
668
- kvmppc_clear_tce(stit->tbl, entry);
747
+ kvmppc_clear_tce(vcpu->kvm->mm, stt, stit->tbl, entry + i);
669748 }
670749 }
671750
672751 for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
673752 kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
674753
675
- return H_SUCCESS;
754
+ return ret;
676755 }
677756 EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
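
Note: two fixes land in the H_STUFF_TCE tail: the failure path clears entry + i rather than always the first entry of the range, and the final return propagates ret instead of unconditionally reporting H_SUCCESS, so a hard failure from the VFIO/IOMMU path now reaches the guest.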