2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  */
@@ -63,7 +52,7 @@
 	struct work_struct work;
 	u32 order;
 
-	/* These fields protected by kvm->lock */
+	/* These fields protected by kvm->arch.mmu_setup_lock */
 
 	/* Possible values and their usage:
 	 * <0 an error occurred during allocation,
@@ -73,7 +62,7 @@
 	int error;
 
 	/* Private to the work thread, until error != -EBUSY,
-	 * then protected by kvm->lock.
+	 * then protected by kvm->arch.mmu_setup_lock.
 	 */
 	struct kvm_hpt_info hpt;
 };
@@ -139,7 +128,7 @@
 	long err = -EBUSY;
 	struct kvm_hpt_info info;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	if (kvm->arch.mmu_ready) {
 		kvm->arch.mmu_ready = 0;
 		/* order mmu_ready vs. vcpus_running */
@@ -183,7 +172,7 @@
 	/* Ensure that each vcpu will flush its TLB on next entry. */
 	cpumask_setall(&kvm->arch.need_tlb_flush);
 
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return err;
 }
 
@@ -268,15 +257,18 @@
 {
 	unsigned long host_lpid, rsvd_lpid;
 
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
-		return -EINVAL;
-
 	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
 		return -EINVAL;
 
-	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
-	host_lpid = mfspr(SPRN_LPID);
-	rsvd_lpid = LPID_RSVD;
+	host_lpid = 0;
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		host_lpid = mfspr(SPRN_LPID);
+
+	/* POWER8 and above have 12-bit LPIDs (10-bit in POWER7) */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		rsvd_lpid = LPID_RSVD;
+	else
+		rsvd_lpid = LPID_RSVD_POWER7;
 
 	kvmppc_init_lpid(rsvd_lpid + 1);
 
@@ -287,29 +279,16 @@
 	return 0;
 }
 
-static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
-{
-	unsigned long msr = vcpu->arch.intr_msr;
-
-	/* If transactional, change to suspend mode on IRQ delivery */
-	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
-		msr |= MSR_TS_S;
-	else
-		msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
-	kvmppc_set_msr(vcpu, msr);
-}
-
 static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
 				long pte_index, unsigned long pteh,
 				unsigned long ptel, unsigned long *pte_idx_ret)
 {
 	long ret;
 
-	/* Protect linux PTE lookup from page table destruction */
-	rcu_read_lock_sched();	/* this disables preemption too */
+	preempt_disable();
 	ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
-				current->mm->pgd, false, pte_idx_ret);
-	rcu_read_unlock_sched();
+				kvm->mm->pgd, false, pte_idx_ret);
+	preempt_enable();
 	if (ret == H_TOO_HARD) {
 		/* this can't happen */
 		pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
@@ -437,10 +416,28 @@
 	return (instr & mask) != 0;
 }
 
-int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
+int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu,
 				unsigned long gpa, gva_t ea, int is_store)
 {
 	u32 last_inst;
+
+	/*
+	 * Fast path - check if the guest physical address corresponds to a
+	 * device on the FAST_MMIO_BUS, if so we can avoid loading the
+	 * instruction all together, then we can just handle it and return.
+	 */
+	if (is_store) {
+		int idx, ret;
+
+		idx = srcu_read_lock(&vcpu->kvm->srcu);
+		ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
+				       NULL);
+		srcu_read_unlock(&vcpu->kvm->srcu, idx);
+		if (!ret) {
+			kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+			return RESUME_GUEST;
+		}
+	}
 
 	/*
 	 * If we fail, we just return to the guest and try executing it again.
@@ -479,10 +476,10 @@
 
 	vcpu->arch.paddr_accessed = gpa;
 	vcpu->arch.vaddr_accessed = ea;
-	return kvmppc_emulate_mmio(run, vcpu);
+	return kvmppc_emulate_mmio(vcpu);
 }
 
-int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
 				unsigned long ea, unsigned long dsisr)
 {
 	struct kvm *kvm = vcpu->kvm;
@@ -491,20 +488,21 @@
 	__be64 *hptep;
 	unsigned long mmu_seq, psize, pte_size;
 	unsigned long gpa_base, gfn_base;
-	unsigned long gpa, gfn, hva, pfn;
+	unsigned long gpa, gfn, hva, pfn, hpa;
 	struct kvm_memory_slot *memslot;
 	unsigned long *rmap;
 	struct revmap_entry *rev;
-	struct page *page, *pages[1];
-	long index, ret, npages;
+	struct page *page;
+	long index, ret;
 	bool is_ci;
-	unsigned int writing, write_ok;
-	struct vm_area_struct *vma;
+	bool writing, write_ok;
+	unsigned int shift;
 	unsigned long rcbits;
 	long mmio_update;
+	pte_t pte, *ptep;
 
 	if (kvm_is_radix(kvm))
-		return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
+		return kvmppc_book3s_radix_page_fault(vcpu, ea, dsisr);
 
 	/*
 	 * Real-mode code has already searched the HPT and found the
@@ -524,7 +522,7 @@
 			gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 			gfn_base = gpa_base >> PAGE_SHIFT;
 			gpa = gpa_base | (ea & (psize - 1));
-			return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+			return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
 						dsisr & DSISR_ISSTORE);
 		}
 	}
@@ -560,7 +558,7 @@
 
 	/* No memslot means it's an emulated MMIO region */
 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
-		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
+		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea,
 					      dsisr & DSISR_ISSTORE);
 
 	/*
@@ -575,59 +573,63 @@
 	smp_rmb();
 
 	ret = -EFAULT;
-	is_ci = false;
-	pfn = 0;
 	page = NULL;
-	pte_size = PAGE_SIZE;
 	writing = (dsisr & DSISR_ISSTORE) != 0;
 	/* If writing != 0, then the HPTE must allow writing, if we get here */
 	write_ok = writing;
 	hva = gfn_to_hva_memslot(memslot, gfn);
-	npages = get_user_pages_fast(hva, 1, writing, pages);
-	if (npages < 1) {
-		/* Check if it's an I/O mapping */
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, hva);
-		if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
-		    (vma->vm_flags & VM_PFNMAP)) {
-			pfn = vma->vm_pgoff +
-				((hva - vma->vm_start) >> PAGE_SHIFT);
-			pte_size = psize;
-			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
-			write_ok = vma->vm_flags & VM_WRITE;
-		}
-		up_read(&current->mm->mmap_sem);
-		if (!pfn)
-			goto out_put;
+
+	/*
+	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+	 * do it with !atomic && !async, which is how we call it.
+	 * We always ask for write permission since the common case
+	 * is that the page is writable.
+	 */
+	if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
+		write_ok = true;
 	} else {
-		page = pages[0];
-		pfn = page_to_pfn(page);
-		if (PageHuge(page)) {
-			page = compound_head(page);
-			pte_size <<= compound_order(page);
-		}
-		/* if the guest wants write access, see if that is OK */
-		if (!writing && hpte_is_writable(r)) {
-			pte_t *ptep, pte;
-			unsigned long flags;
-			/*
-			 * We need to protect against page table destruction
-			 * hugepage split and collapse.
-			 */
-			local_irq_save(flags);
-			ptep = find_current_mm_pte(current->mm->pgd,
-						   hva, NULL, NULL);
-			if (ptep) {
-				pte = kvmppc_read_update_linux_pte(ptep, 1);
-				if (__pte_write(pte))
-					write_ok = 1;
-			}
-			local_irq_restore(flags);
+		/* Call KVM generic code to do the slow-path check */
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+					   writing, &write_ok);
+		if (is_error_noslot_pfn(pfn))
+			return -EFAULT;
+		page = NULL;
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (PageReserved(page))
+				page = NULL;
 		}
 	}
 
+	/*
+	 * Read the PTE from the process' radix tree and use that
+	 * so we get the shift and attribute bits.
+	 */
+	spin_lock(&kvm->mmu_lock);
+	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
+	pte = __pte(0);
+	if (ptep)
+		pte = READ_ONCE(*ptep);
+	spin_unlock(&kvm->mmu_lock);
+	/*
+	 * If the PTE disappeared temporarily due to a THP
+	 * collapse, just return and let the guest try again.
+	 */
+	if (!pte_present(pte)) {
+		if (page)
+			put_page(page);
+		return RESUME_GUEST;
+	}
+	hpa = pte_pfn(pte) << PAGE_SHIFT;
+	pte_size = PAGE_SIZE;
+	if (shift)
+		pte_size = 1ul << shift;
+	is_ci = pte_ci(pte);
+
 	if (psize > pte_size)
 		goto out_put;
+	if (pte_size > psize)
+		hpa |= hva & (pte_size - psize);
 
 	/* Check WIMG vs. the actual page we're accessing */
 	if (!hpte_cache_flags_ok(r, is_ci)) {
@@ -641,14 +643,13 @@
 	}
 
 	/*
-	 * Set the HPTE to point to pfn.
-	 * Since the pfn is at PAGE_SIZE granularity, make sure we
+	 * Set the HPTE to point to hpa.
+	 * Since the hpa is at PAGE_SIZE granularity, make sure we
 	 * don't mask out lower-order bits if psize < PAGE_SIZE.
 	 */
 	if (psize < PAGE_SIZE)
 		psize = PAGE_SIZE;
-	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
-					((pfn << PAGE_SHIFT) & ~(psize - 1));
+	r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa;
 	if (hpte_is_writable(r) && !write_ok)
 		r = hpte_make_readonly(r);
 	ret = RESUME_GUEST;
@@ -713,20 +714,13 @@
 	asm volatile("ptesync" : : : "memory");
 	preempt_enable();
 	if (page && hpte_is_writable(r))
-		SetPageDirty(page);
+		set_page_dirty_lock(page);
 
  out_put:
 	trace_kvm_page_fault_exit(vcpu, hpte, ret);
 
-	if (page) {
-		/*
-		 * We drop pages[0] here, not page because page might
-		 * have been set to the head page of a compound, but
-		 * we have to drop the reference on the correct tail
-		 * page to match the get inside gup()
-		 */
-		put_page(pages[0]);
-	}
+	if (page)
+		put_page(page);
 	return ret;
 
  out_unlock:
@@ -900,11 +894,12 @@
 
 	gfn = memslot->base_gfn;
 	rmapp = memslot->arch.rmap;
+	if (kvm_is_radix(kvm)) {
+		kvmppc_radix_flush_memslot(kvm, memslot);
+		return;
+	}
+
 	for (n = memslot->npages; n; --n, ++gfn) {
-		if (kvm_is_radix(kvm)) {
-			kvm_unmap_radix(kvm, memslot, gfn);
-			continue;
-		}
 
 		/*
 		 * Testing the present bit without locking is OK because
@@ -1175,7 +1170,7 @@
 	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 		goto err;
 	hva = gfn_to_hva_memslot(memslot, gfn);
-	npages = get_user_pages_fast(hva, 1, 1, pages);
+	npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
 	if (npages < 1)
 		goto err;
 	page = pages[0];
@@ -1429,7 +1424,7 @@
 
 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 {
-	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+	if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
 		return;
 
 	if (!resize)
@@ -1456,14 +1451,14 @@
 	if (WARN_ON(resize->error != -EBUSY))
 		return;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 
 	/* Request is still current? */
 	if (kvm->arch.resize_hpt == resize) {
 		/* We may request large allocations here:
-		 * do not sleep with kvm->lock held for a while.
+		 * do not sleep with kvm->arch.mmu_setup_lock held for a while.
 		 */
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.mmu_setup_lock);
 
 		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
 				 resize->order);
@@ -1476,9 +1471,9 @@
 		if (WARN_ON(err == -EBUSY))
 			err = -EINPROGRESS;
 
-		mutex_lock(&kvm->lock);
+		mutex_lock(&kvm->arch.mmu_setup_lock);
 		/* It is possible that kvm->arch.resize_hpt != resize
-		 * after we grab kvm->lock again.
+		 * after we grab kvm->arch.mmu_setup_lock again.
 		 */
 	}
 
@@ -1487,7 +1482,7 @@
 	if (kvm->arch.resize_hpt != resize)
 		resize_hpt_release(kvm, resize);
 
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 }
 
 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
@@ -1504,7 +1499,7 @@
 	if (shift && ((shift < 18) || (shift > 46)))
 		return -EINVAL;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 
 	resize = kvm->arch.resize_hpt;
 
@@ -1547,7 +1542,7 @@
 	ret = 100; /* estimated time in ms */
 
 out:
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return ret;
 }
 
@@ -1570,7 +1565,7 @@
 	if (shift && ((shift < 18) || (shift > 46)))
 		return -EINVAL;
 
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 
 	resize = kvm->arch.resize_hpt;
 
@@ -1607,7 +1602,7 @@
 	smp_mb();
 out_no_hpt:
 	resize_hpt_release(kvm, resize);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return ret;
 }
 
@@ -1744,7 +1739,7 @@
 	int first_pass;
 	unsigned long hpte[2];
 
-	if (!access_ok(VERIFY_WRITE, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (kvm_is_radix(kvm))
 		return 0;
@@ -1844,13 +1839,13 @@
 	int mmu_ready;
 	int pshift;
 
-	if (!access_ok(VERIFY_READ, buf, count))
+	if (!access_ok(buf, count))
 		return -EFAULT;
 	if (kvm_is_radix(kvm))
 		return -EINVAL;
 
 	/* lock out vcpus from running while we're doing this */
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	mmu_ready = kvm->arch.mmu_ready;
 	if (mmu_ready) {
 		kvm->arch.mmu_ready = 0;	/* temporarily */
@@ -1858,7 +1853,7 @@
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
 			kvm->arch.mmu_ready = 1;
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->arch.mmu_setup_lock);
 			return -EBUSY;
 		}
 	}
@@ -1945,7 +1940,7 @@
 	/* Order HPTE updates vs. mmu_ready */
 	smp_wmb();
 	kvm->arch.mmu_ready = mmu_ready;
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 
 	if (err)
 		return err;
@@ -1993,7 +1988,7 @@
 	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
 	if (ret < 0) {
 		kfree(ctx);
-		kvm_put_kvm(kvm);
+		kvm_put_kvm_no_destroy(kvm);
 		return ret;
 	}
 
@@ -2142,9 +2137,8 @@
 
 void kvmppc_mmu_debugfs_init(struct kvm *kvm)
 {
-	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
-						    kvm->arch.debugfs_dir, kvm,
-						    &debugfs_htab_fops);
+	debugfs_create_file("htab", 0400, kvm->arch.debugfs_dir, kvm,
+			    &debugfs_htab_fops);
 }
 
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
21502144 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
....@@ -2154,7 +2148,6 @@
21542148 vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
21552149
21562150 mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
2157
- mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
21582151
21592152 vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
21602153 }