2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
kernel/arch/s390/mm/gmap.c
@@ -9,7 +9,7 @@
  */

 #include <linux/kernel.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/swap.h>
 #include <linux/smp.h>
 #include <linux/spinlock.h>
@@ -17,8 +17,8 @@
 #include <linux/swapops.h>
 #include <linux/ksm.h>
 #include <linux/mman.h>
+#include <linux/pgtable.h>

-#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/gmap.h>
 #include <asm/tlb.h>
@@ -67,7 +67,7 @@
 	INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
 	spin_lock_init(&gmap->guest_table_lock);
 	spin_lock_init(&gmap->shadow_lock);
-	atomic_set(&gmap->ref_count, 1);
+	refcount_set(&gmap->ref_count, 1);
 	page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
 	if (!page)
 		goto out_free;
@@ -214,7 +214,7 @@
  */
 struct gmap *gmap_get(struct gmap *gmap)
 {
-	atomic_inc(&gmap->ref_count);
+	refcount_inc(&gmap->ref_count);
 	return gmap;
 }
 EXPORT_SYMBOL_GPL(gmap_get);
@@ -227,7 +227,7 @@
  */
 void gmap_put(struct gmap *gmap)
 {
-	if (atomic_dec_return(&gmap->ref_count) == 0)
+	if (refcount_dec_and_test(&gmap->ref_count))
 		gmap_free(gmap);
 }
 EXPORT_SYMBOL_GPL(gmap_put);
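
The atomic_t to refcount_t conversion above is the usual hardening pattern: refcount_t saturates instead of wrapping, so an increment on a counter that already hit zero, or a second decrement-and-test, WARNs rather than opening a use-after-free window. A minimal sketch of the same get/put shape, with a hypothetical struct obj standing in for struct gmap:

	#include <linux/refcount.h>
	#include <linux/slab.h>

	struct obj {
		refcount_t ref;
	};

	/* take an extra reference; WARNs and saturates if ref was already 0 */
	static struct obj *obj_get(struct obj *o)
	{
		refcount_inc(&o->ref);
		return o;
	}

	/* drop a reference; only the 1 -> 0 transition frees the object */
	static void obj_put(struct obj *o)
	{
		if (refcount_dec_and_test(&o->ref))
			kfree(o);
	}
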
@@ -300,7 +300,7 @@
 EXPORT_SYMBOL_GPL(gmap_get_enabled);

 /*
- * gmap_alloc_table is assumed to be called with mmap_sem held
+ * gmap_alloc_table is assumed to be called with mmap_lock held
  */
 static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 			    unsigned long init, unsigned long gaddr)
@@ -405,10 +405,10 @@
 		return -EINVAL;

 	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
+	mmap_write_lock(gmap->mm);
 	for (off = 0; off < len; off += PMD_SIZE)
 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
-	up_write(&gmap->mm->mmap_sem);
+	mmap_write_unlock(gmap->mm);
 	if (flush)
 		gmap_flush_tlb(gmap);
 	return 0;
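
Every open-coded rwsem operation on mm->mmap_sem in this file becomes a call into the mmap locking API (linux/mmap_lock.h), which wraps the renamed mm->mmap_lock. The mapping is mechanical; a short reference sketch of the wrappers used here:

	#include <linux/mmap_lock.h>

	static void mmap_lock_examples(struct mm_struct *mm)
	{
		mmap_write_lock(mm);	/* was: down_write(&mm->mmap_sem) */
		mmap_write_unlock(mm);	/* was: up_write(&mm->mmap_sem) */

		mmap_read_lock(mm);	/* was: down_read(&mm->mmap_sem) */
		mmap_read_unlock(mm);	/* was: up_read(&mm->mmap_sem) */
	}
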
@@ -438,7 +438,7 @@
 		return -EINVAL;

 	flush = 0;
-	down_write(&gmap->mm->mmap_sem);
+	mmap_write_lock(gmap->mm);
 	for (off = 0; off < len; off += PMD_SIZE) {
 		/* Remove old translation */
 		flush |= __gmap_unmap_by_gaddr(gmap, to + off);
@@ -448,7 +448,7 @@
 				      (void *) from + off))
 			break;
 	}
-	up_write(&gmap->mm->mmap_sem);
+	mmap_write_unlock(gmap->mm);
 	if (flush)
 		gmap_flush_tlb(gmap);
 	if (off >= len)
@@ -466,7 +466,7 @@
  * Returns user space address which corresponds to the guest address or
  * -EFAULT if no such mapping exists.
  * This function does not establish potentially missing page table entries.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
  * when this function gets called.
  *
  * Note: Can also be called for shadow gmaps.
@@ -495,9 +495,9 @@
 {
 	unsigned long rc;

-	down_read(&gmap->mm->mmap_sem);
+	mmap_read_lock(gmap->mm);
 	rc = __gmap_translate(gmap, gaddr);
-	up_read(&gmap->mm->mmap_sem);
+	mmap_read_unlock(gmap->mm);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(gmap_translate);
@@ -534,7 +534,7 @@
  *
  * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
  * if the vm address is already mapped to a different guest segment.
- * The mmap_sem of the mm that belongs to the address space must be held
+ * The mmap_lock of the mm that belongs to the address space must be held
  * when this function gets called.
  */
 int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
@@ -640,7 +640,7 @@
 	int rc;
 	bool unlocked;

-	down_read(&gmap->mm->mmap_sem);
+	mmap_read_lock(gmap->mm);

 retry:
 	unlocked = false;
@@ -649,13 +649,13 @@
 		rc = vmaddr;
 		goto out_up;
 	}
-	if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+	if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
 			     &unlocked)) {
 		rc = -EFAULT;
 		goto out_up;
 	}
 	/*
-	 * In the case that fixup_user_fault unlocked the mmap_sem during
+	 * In the case that fixup_user_fault unlocked the mmap_lock during
 	 * faultin redo __gmap_translate to not race with a map/unmap_segment.
 	 */
 	if (unlocked)
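
fixup_user_fault() lost its struct task_struct argument here; the remaining signature is fixup_user_fault(mm, address, fault_flags, &unlocked), where *unlocked reports that the fault handler had to drop and retake the mmap lock. The retry above exists precisely because any translation cached before the fault may be stale afterwards. A hedged sketch of the idiom, with a hypothetical helper:

	#include <linux/mm.h>

	/* hypothetical: fault in one user page, redoing work if the lock was dropped */
	static int fault_in_user_page(struct mm_struct *mm, unsigned long vmaddr)
	{
		bool unlocked;

		mmap_read_lock(mm);
	retry:
		unlocked = false;
		if (fixup_user_fault(mm, vmaddr, FAULT_FLAG_WRITE, &unlocked)) {
			mmap_read_unlock(mm);
			return -EFAULT;
		}
		if (unlocked)
			goto retry;	/* lock was dropped: redo stale lookups */
		mmap_read_unlock(mm);
		return 0;
	}
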
@@ -663,13 +663,13 @@

 	rc = __gmap_link(gmap, gaddr, vmaddr);
 out_up:
-	up_read(&gmap->mm->mmap_sem);
+	mmap_read_unlock(gmap->mm);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(gmap_fault);

 /*
- * this function is assumed to be called with mmap_sem held
+ * this function is assumed to be called with mmap_lock held
  */
 void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
 {
@@ -697,7 +697,7 @@
 	unsigned long gaddr, vmaddr, size;
 	struct vm_area_struct *vma;

-	down_read(&gmap->mm->mmap_sem);
+	mmap_read_lock(gmap->mm);
 	for (gaddr = from; gaddr < to;
 	     gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
 		/* Find the vm address for the guest address */
@@ -720,7 +720,7 @@
 		size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
 		zap_page_range(vma, vmaddr, size);
 	}
-	up_read(&gmap->mm->mmap_sem);
+	mmap_read_unlock(gmap->mm);
 }
 EXPORT_SYMBOL_GPL(gmap_discard);

@@ -789,19 +789,19 @@
 			      unsigned long gaddr, int level)
 {
 	const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
-	unsigned long *table;
+	unsigned long *table = gmap->table;

-	if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
-		return NULL;
 	if (gmap_is_shadow(gmap) && gmap->removed)
+		return NULL;
+
+	if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
 		return NULL;

 	if (asce_type != _ASCE_TYPE_REGION1 &&
 	    gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
 		return NULL;

-	table = gmap->table;
-	switch (gmap->asce & _ASCE_TYPE_MASK) {
+	switch (asce_type) {
 	case _ASCE_TYPE_REGION1:
 		table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 		if (level == 4)
@@ -809,7 +809,7 @@
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* Fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_REGION2:
 		table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
 		if (level == 3)
@@ -817,7 +817,7 @@
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* Fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_REGION3:
 		table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
 		if (level == 2)
@@ -825,7 +825,7 @@
 		if (*table & _REGION_ENTRY_INVALID)
 			return NULL;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* Fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_SEGMENT:
 		table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 		if (level == 1)
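
The /* Fallthrough */ comments become the fallthrough; pseudo-keyword (linux/compiler_attributes.h), which expands to __attribute__((__fallthrough__)) where the compiler supports it, so -Wimplicit-fallthrough can tell deliberate fall-through from a forgotten break. A small sketch of the construct with a hypothetical level counter:

	static int levels_below(int type)
	{
		int levels = 1;

		switch (type) {
		case 3:
			levels++;
			fallthrough;	/* a statement, checked by the compiler */
		case 2:
			levels++;
			fallthrough;
		default:
			break;
		}
		return levels;
	}
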
@@ -880,10 +880,10 @@

 	BUG_ON(gmap_is_shadow(gmap));
 	fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
-	if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+	if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
 		return -EFAULT;
 	if (unlocked)
-		/* lost mmap_sem, caller has to retry __gmap_translate */
+		/* lost mmap_lock, caller has to retry __gmap_translate */
 		return 0;
 	/* Connect the page tables */
 	return __gmap_link(gmap, gaddr, vmaddr);
@@ -912,10 +912,16 @@
 	pmd_t *pmdp;

 	BUG_ON(gmap_is_shadow(gmap));
-	spin_lock(&gmap->guest_table_lock);
 	pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
+	if (!pmdp)
+		return NULL;

-	if (!pmdp || pmd_none(*pmdp)) {
+	/* without huge pages, there is no need to take the table lock */
+	if (!gmap->mm->context.allow_gmap_hpage_1m)
+		return pmd_none(*pmdp) ? NULL : pmdp;
+
+	spin_lock(&gmap->guest_table_lock);
+	if (pmd_none(*pmdp)) {
 		spin_unlock(&gmap->guest_table_lock);
 		return NULL;
 	}
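
The reordering above is a fast-path optimization: gmap_table_walk() no longer runs under guest_table_lock, and when the mm does not allow 1 M gmap hugepages a pmd entry can never be split concurrently, so a plain pmd_none() check suffices and the spinlock is taken only on the hugepage-capable path. A hedged, self-contained sketch of that conditional-locking shape, under hypothetical names:

	#include <linux/spinlock.h>

	struct ent_table {
		spinlock_t lock;		/* serializes entry mutation */
		bool may_mutate;		/* e.g. hugepage splitting enabled */
		unsigned long entries[512];
	};

	static unsigned long *entry_walk(struct ent_table *t, unsigned long idx)
	{
		unsigned long *e = &t->entries[idx];

		if (!t->may_mutate)
			return *e ? e : NULL;	/* stable entries: lock-free */

		spin_lock(&t->lock);
		if (!*e) {
			spin_unlock(&t->lock);
			return NULL;
		}
		return e;			/* caller unlocks when done */
	}
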
@@ -948,7 +954,7 @@
  * -EAGAIN if a fixup is needed
  * -EINVAL if unsupported notifier bits have been specified
  *
- * Expected to be called with sg->mm->mmap_sem in read and
+ * Expected to be called with sg->mm->mmap_lock in read and
  * guest_table_lock held.
  */
 static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
@@ -994,7 +1000,7 @@
  * Returns 0 if successfully protected, -ENOMEM if out of memory and
  * -EAGAIN if a fixup is needed.
  *
- * Expected to be called with sg->mm->mmap_sem in read
+ * Expected to be called with sg->mm->mmap_lock in read
  */
 static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
 			    pmd_t *pmdp, int prot, unsigned long bits)
@@ -1030,7 +1036,7 @@
  * Returns 0 if successfully protected, -ENOMEM if out of memory and
  * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 			      unsigned long len, int prot, unsigned long bits)
@@ -1101,9 +1107,9 @@
 		return -EINVAL;
 	if (!MACHINE_HAS_ESOP && prot == PROT_READ)
 		return -EINVAL;
-	down_read(&gmap->mm->mmap_sem);
+	mmap_read_lock(gmap->mm);
 	rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
-	up_read(&gmap->mm->mmap_sem);
+	mmap_read_unlock(gmap->mm);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
@@ -1119,7 +1125,7 @@
  * if reading using the virtual address failed. -EINVAL if called on a gmap
  * shadow.
  *
- * Called with gmap->mm->mmap_sem in read.
+ * Called with gmap->mm->mmap_lock in read.
  */
 int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
 {
@@ -1593,7 +1599,7 @@
 			continue;
 		if (!sg->initialized)
 			return ERR_PTR(-EAGAIN);
-		atomic_inc(&sg->ref_count);
+		refcount_inc(&sg->ref_count);
 		return sg;
 	}
 	return NULL;
@@ -1681,7 +1687,7 @@
 			}
 		}
 	}
-	atomic_set(&new->ref_count, 2);
+	refcount_set(&new->ref_count, 2);
 	list_add(&new->list, &parent->children);
 	if (asce & _ASCE_REAL_SPACE) {
 		/* nothing to protect, return right away */
@@ -1691,11 +1697,11 @@
 	}
 	spin_unlock(&parent->shadow_lock);
 	/* protect after insertion, so it will get properly invalidated */
-	down_read(&parent->mm->mmap_sem);
+	mmap_read_lock(parent->mm);
 	rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
 				((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
 				PROT_READ, GMAP_NOTIFY_SHADOW);
-	up_read(&parent->mm->mmap_sem);
+	mmap_read_unlock(parent->mm);
 	spin_lock(&parent->shadow_lock);
 	new->initialized = true;
 	if (rc) {
@@ -1724,7 +1730,7 @@
  * shadow table structure is incomplete, -ENOMEM if out of memory and
  * -EFAULT if an address in the parent gmap could not be resolved.
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
 		    int fake)
@@ -1808,7 +1814,7 @@
  * shadow table structure is incomplete, -ENOMEM if out of memory and
  * -EFAULT if an address in the parent gmap could not be resolved.
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
 		    int fake)
@@ -1892,7 +1898,7 @@
  * shadow table structure is incomplete, -ENOMEM if out of memory and
  * -EFAULT if an address in the parent gmap could not be resolved.
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
 		    int fake)
@@ -1976,7 +1982,7 @@
  * Returns 0 if the shadow page table was found and -EAGAIN if the page
  * table was not found.
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
 			   unsigned long *pgt, int *dat_protection,
@@ -2016,7 +2022,7 @@
  * shadow table structure is incomplete, -ENOMEM if out of memory,
  * -EFAULT if an address in the parent gmap could not be resolved and
  *
- * Called with gmap->mm->mmap_sem in read
+ * Called with gmap->mm->mmap_lock in read
  */
 int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
 		    int fake)
@@ -2095,7 +2101,7 @@
  * shadow table structure is incomplete, -ENOMEM if out of memory and
  * -EFAULT if an address in the parent gmap could not be resolved.
  *
- * Called with sg->mm->mmap_sem in read.
+ * Called with sg->mm->mmap_lock in read.
  */
 int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
 {
@@ -2424,8 +2430,8 @@
  * This function is assumed to be called with the guest_table_lock
  * held.
  */
-bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
-				   unsigned long gaddr)
+static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
+					  unsigned long gaddr)
 {
 	if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
 		return false;
@@ -2480,23 +2486,36 @@
 }
 EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
+				    unsigned long end, struct mm_walk *walk)
+{
+	struct vm_area_struct *vma = walk->vma;
+
+	split_huge_pmd(vma, pmd, addr);
+	return 0;
+}
+
+static const struct mm_walk_ops thp_split_walk_ops = {
+	.pmd_entry = thp_split_walk_pmd_entry,
+};
+
 static inline void thp_split_mm(struct mm_struct *mm)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	struct vm_area_struct *vma;
-	unsigned long addr;

 	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
-		for (addr = vma->vm_start;
-		     addr < vma->vm_end;
-		     addr += PAGE_SIZE)
-			follow_page(vma, addr, FOLL_SPLIT);
 		vma->vm_flags &= ~VM_HUGEPAGE;
 		vma->vm_flags |= VM_NOHUGEPAGE;
+		walk_page_vma(vma, &thp_split_walk_ops, NULL);
 	}
 	mm->def_flags |= VM_NOHUGEPAGE;
-#endif
 }
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 /*
  * Remove all empty zero pages from the mapping for lazy refaulting
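
This rewrite tracks the pagewalk API change in which per-call struct mm_walk instances were replaced by a const struct mm_walk_ops table passed to walk_page_range(mm, start, end, ops, private) or walk_page_vma(vma, ops, private); that is what the new linux/pagewalk.h include at the top is for. Splitting THPs through a pmd_entry callback also visits only populated PMDs instead of probing every page with follow_page(FOLL_SPLIT). A minimal walker under those assumptions, counting populated PMD entries:

	#include <linux/pagewalk.h>

	static int count_pmd_entry(pmd_t *pmd, unsigned long addr,
				   unsigned long end, struct mm_walk *walk)
	{
		(*(unsigned long *)walk->private)++;
		return 0;	/* a non-zero return would abort the walk */
	}

	static const struct mm_walk_ops count_walk_ops = {
		.pmd_entry = count_pmd_entry,
	};

	/* hypothetical usage; walk_page_range() asserts the mmap lock is held */
	static unsigned long count_pmds(struct mm_struct *mm)
	{
		unsigned long n = 0;

		mmap_read_lock(mm);
		walk_page_range(mm, 0, TASK_SIZE, &count_walk_ops, &n);
		mmap_read_unlock(mm);
		return n;
	}
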
@@ -2521,13 +2540,9 @@
 		return 0;
 }

-static inline void zap_zero_pages(struct mm_struct *mm)
-{
-	struct mm_walk walk = { .pmd_entry = __zap_zero_pages };
-
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-}
+static const struct mm_walk_ops zap_zero_walk_ops = {
+	.pmd_entry = __zap_zero_pages,
+};

 /*
  * switch on pgstes for its userspace process (for kvm)
@@ -2542,15 +2557,32 @@
 	/* Fail if the page tables are 2K */
 	if (!mm_alloc_pgste(mm))
 		return -EINVAL;
-	down_write(&mm->mmap_sem);
+	mmap_write_lock(mm);
 	mm->context.has_pgste = 1;
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
-	zap_zero_pages(mm);
-	up_write(&mm->mmap_sem);
+	walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
+	mmap_write_unlock(mm);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);
+
+int gmap_mark_unmergeable(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int ret;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				  MADV_UNMERGEABLE, &vma->vm_flags);
+		if (ret)
+			return ret;
+	}
+	mm->def_flags &= ~VM_MERGEABLE;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);

 /*
  * Enable storage key handling from now on and initialize the storage
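
gmap_mark_unmergeable() factors the KSM opt-out that used to be inlined in s390_enable_skey() (see the hunk below) into an exported helper, presumably so other KVM paths can reuse it. It drives the same ksm_madvise(..., MADV_UNMERGEABLE, ...) code that the madvise(2) syscall uses, and clearing VM_MERGEABLE from mm->def_flags keeps VMAs created later unmergeable as well. For comparison, the userspace counterpart, assuming buf and len describe an existing mapping:

	#include <sys/mman.h>

	/* userspace analogue: opt a region out of KSM merging */
	static int mark_unmergeable(void *buf, size_t len)
	{
		return madvise(buf, len, MADV_UNMERGEABLE);
	}
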
@@ -2561,6 +2593,18 @@
 {
 	/* Clear storage key */
 	ptep_zap_key(walk->mm, addr, pte);
+	return 0;
+}
+
+/*
+ * Give a chance to schedule after setting a key to 256 pages.
+ * We only hold the mm lock, which is a rwsem and the kvm srcu.
+ * Both can sleep.
+ */
+static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
+				  unsigned long next, struct mm_walk *walk)
+{
+	cond_resched();
 	return 0;
 }

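
The new __s390_enable_skey_pmd() callback exists only to call cond_resched(): the walk runs under the mmap lock (a sleepable rwsem) and KVM's SRCU, so sleeping is legal, and yielding once per PMD, i.e. after every 256 4 KB pages worth of key setting, bounds scheduling latency on large guests. The same cond_resched() is added to the hugetlb callback in the next hunk. The pattern generalizes to any long pagewalk:

	#include <linux/sched.h>

	/* hedged sketch: yield once per PMD during a long, sleepable walk */
	static int long_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
				       unsigned long end, struct mm_walk *walk)
	{
		cond_resched();		/* mmap lock is a rwsem: sleeping is fine */
		return 0;
	}
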
@@ -2586,39 +2630,35 @@
 	end = start + HPAGE_SIZE - 1;
 	__storage_key_init_range(start, end);
 	set_bit(PG_arch_1, &page->flags);
+	cond_resched();
 	return 0;
 }

+static const struct mm_walk_ops enable_skey_walk_ops = {
+	.hugetlb_entry = __s390_enable_skey_hugetlb,
+	.pte_entry = __s390_enable_skey_pte,
+	.pmd_entry = __s390_enable_skey_pmd,
+};
+
 int s390_enable_skey(void)
 {
-	struct mm_walk walk = {
-		.hugetlb_entry = __s390_enable_skey_hugetlb,
-		.pte_entry = __s390_enable_skey_pte,
-	};
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
 	int rc = 0;

-	down_write(&mm->mmap_sem);
+	mmap_write_lock(mm);
 	if (mm_uses_skeys(mm))
 		goto out_up;

 	mm->context.uses_skeys = 1;
-	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
-				MADV_UNMERGEABLE, &vma->vm_flags)) {
-			mm->context.uses_skeys = 0;
-			rc = -ENOMEM;
-			goto out_up;
-		}
+	rc = gmap_mark_unmergeable();
+	if (rc) {
+		mm->context.uses_skeys = 0;
+		goto out_up;
 	}
-	mm->def_flags &= ~VM_MERGEABLE;
-
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
+	walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);

 out_up:
-	up_write(&mm->mmap_sem);
+	mmap_write_unlock(mm);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
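
Note the error handling change in s390_enable_skey(): mm->context.uses_skeys is set before gmap_mark_unmergeable() and rolled back if the helper fails, and the helper's return code is propagated instead of the previous hard-coded -ENOMEM. As a standalone sketch of that enable/attempt/rollback shape (hypothetical names throughout):

	struct feature_ctx {
		int enabled;
	};

	static int prepare_feature(struct feature_ctx *c)
	{
		return 0;	/* stand-in for gmap_mark_unmergeable() */
	}

	static int enable_feature(struct feature_ctx *c)
	{
		int rc;

		c->enabled = 1;			/* publish the flag first */
		rc = prepare_feature(c);
		if (rc)
			c->enabled = 0;		/* undo on failure, keep rc */
		return rc;
	}
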
@@ -2633,13 +2673,138 @@
 	return 0;
 }

+static const struct mm_walk_ops reset_cmma_walk_ops = {
+	.pte_entry = __s390_reset_cmma,
+};
+
 void s390_reset_cmma(struct mm_struct *mm)
 {
-	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
-
-	down_write(&mm->mmap_sem);
-	walk.mm = mm;
-	walk_page_range(0, TASK_SIZE, &walk);
-	up_write(&mm->mmap_sem);
+	mmap_write_lock(mm);
+	walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
+	mmap_write_unlock(mm);
 }
 EXPORT_SYMBOL_GPL(s390_reset_cmma);
+
+/*
+ * make inaccessible pages accessible again
+ */
+static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
+			    unsigned long next, struct mm_walk *walk)
+{
+	pte_t pte = READ_ONCE(*ptep);
+
+	if (pte_present(pte))
+		WARN_ON_ONCE(uv_destroy_page(pte_val(pte) & PAGE_MASK));
+	return 0;
+}
+
+static const struct mm_walk_ops reset_acc_walk_ops = {
+	.pte_entry = __s390_reset_acc,
+};
+
+#include <linux/sched/mm.h>
+void s390_reset_acc(struct mm_struct *mm)
+{
+	if (!mm_is_protected(mm))
+		return;
+	/*
+	 * we might be called during
+	 * reset: we walk the pages and clear
+	 * close of all kvm file descriptors: we walk the pages and clear
+	 * exit of process on fd closure: vma already gone, do nothing
+	 */
+	if (!mmget_not_zero(mm))
+		return;
+	mmap_read_lock(mm);
+	walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
+	mmap_read_unlock(mm);
+	mmput(mm);
+}
+EXPORT_SYMBOL_GPL(s390_reset_acc);
+
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are now not needed anymore.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+	struct page *old;
+
+	old = virt_to_page(gmap->table);
+	spin_lock(&gmap->guest_table_lock);
+	list_del(&old->lru);
+	/*
+	 * Sometimes the topmost page might need to be "removed" multiple
+	 * times, for example if the VM is rebooted into secure mode several
+	 * times concurrently, or if s390_replace_asce fails after calling
+	 * s390_unlist_old_asce and is attempted again later. In that case
+	 * the old asce has been removed from the list, and therefore it
+	 * will not be freed when the VM terminates, but the ASCE is still
+	 * in use and still pointed to.
+	 * A subsequent call to replace_asce will follow the pointer and try
+	 * to remove the same page from the list again.
+	 * Therefore it's necessary that the page of the ASCE has valid
+	 * pointers, so list_del can work (and do nothing) without
+	 * dereferencing stale or invalid pointers.
+	 */
+	INIT_LIST_HEAD(&old->lru);
+	spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
+{
+	unsigned long asce;
+	struct page *page;
+	void *table;
+
+	s390_unlist_old_asce(gmap);
+
+	page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+	if (!page)
+		return -ENOMEM;
+	page->index = 0;
+	table = page_to_virt(page);
+	memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
+	/*
+	 * The caller has to deal with the old ASCE, but here we make sure
+	 * the new one is properly added to the CRST list, so that
+	 * it will be freed when the VM is torn down.
+	 */
+	spin_lock(&gmap->guest_table_lock);
+	list_add(&page->lru, &gmap->crst_list);
+	spin_unlock(&gmap->guest_table_lock);
+
+	/* Set new table origin while preserving existing ASCE control bits */
+	asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+	WRITE_ONCE(gmap->asce, asce);
+	WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+	WRITE_ONCE(gmap->table, table);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(s390_replace_asce);
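
s390_replace_asce() copies the whole top-level CRST page (1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT) bytes), links the copy into the CRST list, and republishes the ASCE and table pointer with WRITE_ONCE() so concurrent readers see consistent values. Because s390_unlist_old_asce() has already taken the old top-level page off the list, that page is afterwards owned by the caller. A hedged sketch of how a caller might keep the old table alive on purpose, as the comments describe:

	/* hypothetical caller: save the old top-level page before replacing it */
	static int replace_and_keep_old(struct gmap *gmap, struct page **old_page)
	{
		*old_page = virt_to_page(gmap->table);	/* ours once unlisted */
		return s390_replace_asce(gmap);
	}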