commit 102a0743326a03cd1a1202ceda21e175b7d3575c (2024-02-20)
--- a/kernel/arch/s390/mm/fault.c
+++ b/kernel/arch/s390/mm/fault.c
@@ -33,22 +33,22 @@
 #include <linux/hugetlb.h>
 #include <asm/asm-offsets.h>
 #include <asm/diag.h>
-#include <asm/pgtable.h>
 #include <asm/gmap.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/facility.h>
+#include <asm/uv.h>
 #include "../kernel/entry.h"
 
 #define __FAIL_ADDR_MASK -4096L
 #define __SUBCODE_MASK 0x0600
 #define __PF_RES_FIELD 0x8000000000000000ULL
 
-#define VM_FAULT_BADCONTEXT	0x010000
-#define VM_FAULT_BADMAP		0x020000
-#define VM_FAULT_BADACCESS	0x040000
-#define VM_FAULT_SIGNAL		0x080000
-#define VM_FAULT_PFAULT		0x100000
+#define VM_FAULT_BADCONTEXT	((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP		((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS	((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL		((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT		((__force vm_fault_t) 0x100000)
 
 enum fault_type {
 	KERNEL_FAULT,
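Background for the new casts: vm_fault_t is a sparse "bitwise" type (roughly typedef unsigned int __bitwise vm_fault_t; in include/linux/mm_types.h), so arch-private fault codes built from bare integers need a __force cast to pass a `make C=1` run cleanly. A minimal self-contained sketch of the pattern; my_fault_t and its helper are hypothetical stand-ins, not kernel API:

    /* Outside a sparse run, __bitwise and __force expand to nothing. */
    #ifndef __CHECKER__
    #define __bitwise
    #define __force
    #endif

    typedef unsigned int __bitwise my_fault_t;  /* stand-in for vm_fault_t */

    /* Force the plain integer into the bitwise type once, at the
     * definition, instead of casting at every use site: */
    #define MY_FAULT_BADMAP ((__force my_fault_t) 0x020000)

    static int my_fault_is_badmap(my_fault_t fault)
    {
            /* Forcing back to a plain integer is the sanctioned escape hatch. */
            return ((__force unsigned int) fault & 0x020000) != 0;
    }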
@@ -67,48 +67,10 @@
 }
 early_initcall(fault_init);
 
-static inline int notify_page_fault(struct pt_regs *regs)
-{
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, 14))
-			ret = 1;
-		preempt_enable();
-	}
-	return ret;
-}
-
-
-/*
- * Unlock any spinlocks which will prevent us from getting the
- * message out.
- */
-void bust_spinlocks(int yes)
-{
-	if (yes) {
-		oops_in_progress = 1;
-	} else {
-		int loglevel_save = console_loglevel;
-		console_unblank();
-		oops_in_progress = 0;
-		/*
-		 * OK, the message is on the console.  Now we call printk()
-		 * without oops_in_progress set so that printk will give klogd
-		 * a poke.  Hold onto your hats...
-		 */
-		console_loglevel = 15;
-		printk(" ");
-		console_loglevel = loglevel_save;
-	}
-}
-
 /*
  * Find out which address space caused the exception.
  */
-static inline enum fault_type get_fault_type(struct pt_regs *regs)
+static enum fault_type get_fault_type(struct pt_regs *regs)
 {
 	unsigned long trans_exc_code;
 
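The deleted notify_page_fault() was one of several near-identical per-arch copies; kprobe_page_fault() from include/linux/kprobes.h now provides the same check (the call site is switched over in the hunk at -424 below). A paraphrased sketch of the common helper, assuming your tree matches mainline:

    static inline bool kprobe_page_fault(struct pt_regs *regs,
                                         unsigned int trap)
    {
            /* Faults from user mode can never hit a kprobe. */
            if (!kprobes_built_in() || user_mode(regs))
                    return false;
            /* kprobe_running() reads per-CPU state, so we must not be
             * preemptible; unlike the old arch copy, the helper bails
             * out instead of disabling preemption itself. */
            if (preemptible())
                    return false;
            if (!kprobe_running())
                    return false;
            return kprobe_fault_handler(regs, trap);
    }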
@@ -143,7 +105,7 @@
 {
 	unsigned long dummy;
 
-	return probe_kernel_address((unsigned long *)p, dummy);
+	return get_kernel_nofault(dummy, (unsigned long *)p);
 }
 
 static void dump_pagetable(unsigned long asce, unsigned long address)
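One easy-to-miss detail in the probe_kernel_address() to get_kernel_nofault() conversion: the argument order flips, with the destination now first, matching get_user(). Usage sketch (the wrapper name is hypothetical, kernel context assumed):

    /* Returns nonzero if a word at "p" is not readable from kernel
     * context; get_kernel_nofault() yields 0 on success and -EFAULT
     * on a faulting access. */
    static int kernel_word_unreadable(void *p)
    {
            unsigned long dummy;

            return get_kernel_nofault(dummy, (unsigned long *)p) != 0;
    }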
@@ -160,7 +122,7 @@
 		if (*table & _REGION_ENTRY_INVALID)
 			goto out;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_REGION2:
 		table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
 		if (bad_address(table))
@@ -169,7 +131,7 @@
 		if (*table & _REGION_ENTRY_INVALID)
 			goto out;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_REGION3:
 		table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
 		if (bad_address(table))
@@ -178,7 +140,7 @@
 		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
 			goto out;
 		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
-		/* fallthrough */
+		fallthrough;
 	case _ASCE_TYPE_SEGMENT:
 		table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 		if (bad_address(table))
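The comment-to-keyword conversions in the three hunks above (and the two later ones in do_fault_error()) are about tooling: once the kernel builds with -Wimplicit-fallthrough, compilers only honor an attribute, not a comment. The fallthrough pseudo-keyword from include/linux/compiler_attributes.h expands approximately as follows (paraphrased):

    #if defined(__has_attribute) && __has_attribute(__fallthrough__)
    # define fallthrough __attribute__((__fallthrough__))
    #else
    # define fallthrough do {} while (0) /* fallthrough */
    #endif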
@@ -238,6 +200,8 @@
 		asce = S390_lowcore.kernel_asce;
 		pr_cont("kernel ");
 		break;
+	default:
+		unreachable();
 	}
 	pr_cont("ASCE.\n");
 	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
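The new default: arm is for the compiler rather than the hardware: the switch assigns asce in every enum case, but a compiler that cannot prove the switch exhaustive may warn that asce is used uninitialized at the dump_pagetable() call. unreachable() (include/linux/compiler.h) boils down to the builtin below; that motivation is inferred here, the commit message is authoritative.

    /* Reaching this is undefined behaviour, so the optimizer may assume
     * every real enum value was handled by an earlier case label. */
    #define unreachable() __builtin_unreachable()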
@@ -270,8 +234,19 @@
 {
 	report_user_fault(regs, SIGSEGV, 1);
 	force_sig_fault(SIGSEGV, si_code,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
+}
+
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+	const struct exception_table_entry *fixup;
+
+	fixup = search_extable(__start_dma_ex_table,
+			       __stop_dma_ex_table - __start_dma_ex_table,
+			       addr);
+	if (!fixup)
+		fixup = search_exception_tables(addr);
+	return fixup;
 }
 
 static noinline void do_no_context(struct pt_regs *regs)
@@ -279,11 +254,9 @@
 	const struct exception_table_entry *fixup;
 
 	/* Are we prepared to handle this kernel fault? */
-	fixup = search_exception_tables(regs->psw.addr);
-	if (fixup) {
-		regs->psw.addr = extable_fixup(fixup);
+	fixup = s390_search_extables(regs->psw.addr);
+	if (fixup && ex_handle(fixup, regs))
 		return;
-	}
 
 	/*
 	 * Oops. The kernel tried to access some bad page. We'll have to
@@ -320,8 +293,7 @@
 	 * or user mode.
 	 */
 	force_sig_fault(SIGBUS, BUS_ADRERR,
-			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
-			current);
+			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
 }
 
 static noinline int signal_return(struct pt_regs *regs)
@@ -353,6 +325,7 @@
 	case VM_FAULT_BADACCESS:
 		if (access == VM_EXEC && signal_return(regs) == 0)
 			break;
+		fallthrough;
 	case VM_FAULT_BADMAP:
 		/* Bad memory access. Check if it is kernel or user space. */
 		if (user_mode(regs)) {
@@ -362,6 +335,7 @@
 			do_sigsegv(regs, si_code);
 			break;
 		}
+		fallthrough;
 	case VM_FAULT_BADCONTEXT:
 	case VM_FAULT_PFAULT:
 		do_no_context(regs);
@@ -400,7 +374,7 @@
  * routines.
  *
  * interruption code (int_code):
- *   04       Protection           ->  Write-Protection  (suprression)
+ *   04       Protection           ->  Write-Protection  (suppression)
  *   10       Segment translation  ->  Not present       (nullification)
  *   11       Page translation     ->  Not present       (nullification)
  *   3b       Region third trans.  ->  Not present       (nullification)
@@ -424,7 +398,7 @@
 	 */
 	clear_pt_regs_flag(regs, PIF_PER_TRAP);
 
-	if (notify_page_fault(regs))
+	if (kprobe_page_fault(regs, 14))
 		return 0;
 
 	mm = tsk->mm;
@@ -452,12 +426,14 @@
 
 	address = trans_exc_code & __FAIL_ADDR_MASK;
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	flags = FAULT_FLAG_DEFAULT;
 	if (user_mode(regs))
 		flags |= FAULT_FLAG_USER;
-	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+	if ((trans_exc_code & store_indication) == 0x400)
+		access = VM_WRITE;
+	if (access == VM_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	gmap = NULL;
 	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
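Two separate improvements in the hunk above. First, FAULT_FLAG_DEFAULT replaces the hand-assembled pair and additionally opts in to interruptible waits; paraphrased from include/linux/mm.h:

    #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \
                                FAULT_FLAG_KILLABLE | \
                                FAULT_FLAG_INTERRUPTIBLE)

Second, a store indication now upgrades access itself to VM_WRITE instead of only setting FAULT_FLAG_WRITE, so the later VMA permission check can reject a store into a mapping that lacks VM_WRITE rather than merely tagging the fault as a write.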
@@ -502,9 +478,8 @@
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	fault = handle_mm_fault(vma, address, flags);
-	/* No reason to continue if interrupted by SIGKILL. */
-	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+	fault = handle_mm_fault(vma, address, flags, regs);
+	if (fault_signal_pending(fault, regs)) {
 		fault = VM_FAULT_SIGNAL;
 		if (flags & FAULT_FLAG_RETRY_NOWAIT)
 			goto out_up;
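fault_signal_pending() generalizes the deleted SIGKILL test: a fatal signal still aborts the retry in any context, and for faults that arrived from user mode any pending signal does. Paraphrased from include/linux/sched/signal.h:

    static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                            struct pt_regs *regs)
    {
            return unlikely((fault_flags & VM_FAULT_RETRY) &&
                            (fatal_signal_pending(current) ||
                             (user_mode(regs) && signal_pending(current))));
    }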
@@ -513,36 +488,19 @@
 	if (unlikely(fault & VM_FAULT_ERROR))
 		goto out_up;
 
-	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
-	 */
 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
 		if (fault & VM_FAULT_RETRY) {
 			if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
 			    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
 				/* FAULT_FLAG_RETRY_NOWAIT has been set,
-				 * mmap_sem has not been released */
+				 * mmap_lock has not been released */
 				current->thread.gmap_pfault = 1;
 				fault = VM_FAULT_PFAULT;
 				goto out_up;
 			}
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~(FAULT_FLAG_ALLOW_RETRY |
-				   FAULT_FLAG_RETRY_NOWAIT);
+			flags &= ~FAULT_FLAG_RETRY_NOWAIT;
 			flags |= FAULT_FLAG_TRIED;
-			down_read(&mm->mmap_sem);
+			mmap_read_lock(mm);
 			goto retry;
 		}
 	}
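The deleted maj_flt/min_flt bookkeeping has not been lost: passing regs into handle_mm_fault() lets the MM core account each fault exactly once on completion, and dropping the ALLOW_RETRY-clearing workaround relies on the core now tolerating repeated VM_FAULT_RETRY once FAULT_FLAG_TRIED is set. A reduced sketch of the accounting the core performs on the arch's behalf (see mm_account_fault() in mm/memory.c for the full retry and error handling):

    static void mm_account_fault(struct pt_regs *regs,
                                 unsigned long address, vm_fault_t fault)
    {
            if (fault & VM_FAULT_MAJOR) {
                    current->maj_flt++;
                    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
            } else {
                    current->min_flt++;
                    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
            }
    }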
@@ -560,7 +518,7 @@
 	}
 	fault = 0;
 out_up:
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 out:
 	return fault;
 }
@@ -607,7 +565,7 @@
 	int access;
 	vm_fault_t fault;
 
-	access = VM_READ | VM_EXEC | VM_WRITE;
+	access = VM_ACCESS_FLAGS;
 	fault = do_exception(regs, access);
 	if (unlikely(fault))
 		do_fault_error(regs, access, fault);
@@ -639,17 +597,19 @@
 	u64 reserved;
 } __attribute__ ((packed, aligned(8)));
 
+static struct pfault_refbk pfault_init_refbk = {
+	.refdiagc = 0x258,
+	.reffcode = 0,
+	.refdwlen = 5,
+	.refversn = 2,
+	.refgaddr = __LC_LPP,
+	.refselmk = 1ULL << 48,
+	.refcmpmk = 1ULL << 48,
+	.reserved = __PF_RES_FIELD
+};
+
 int pfault_init(void)
 {
-	struct pfault_refbk refbk = {
-		.refdiagc = 0x258,
-		.reffcode = 0,
-		.refdwlen = 5,
-		.refversn = 2,
-		.refgaddr = __LC_LPP,
-		.refselmk = 1ULL << 48,
-		.refcmpmk = 1ULL << 48,
-		.reserved = __PF_RES_FIELD };
 	int rc;
 
 	if (pfault_disable)
@@ -661,18 +621,20 @@
 		"1:	la	%0,8\n"
 		"2:\n"
 		EX_TABLE(0b,1b)
-		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
+		: "=d" (rc)
+		: "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
 	return rc;
 }
 
+static struct pfault_refbk pfault_fini_refbk = {
+	.refdiagc = 0x258,
+	.reffcode = 1,
+	.refdwlen = 5,
+	.refversn = 2,
+};
+
 void pfault_fini(void)
 {
-	struct pfault_refbk refbk = {
-		.refdiagc = 0x258,
-		.reffcode = 1,
-		.refdwlen = 5,
-		.refversn = 2,
-	};
 
 	if (pfault_disable)
 		return;
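Hoisting the refbk blocks to file scope plausibly keeps the DIAG 0x258 parameter block off the kernel stack, which can be vmalloc-mapped under CONFIG_VMAP_STACK and therefore unsuitable for an instruction that needs a real-addressable buffer; that rationale is an assumption here, the commit message is authoritative. The constraint idiom itself is worth noting: the block is passed by address through "a" and also named as an "m" input, so the compiler knows the asm reads the memory and must not elide or reorder its initialization.

    /* Sketch of the dual-constraint idiom; foo_parm is hypothetical. */
    static const unsigned long foo_parm[4] = { 1, 2, 3, 4 };

    static inline void use_parm_block(void)
    {
            /* Real code would put an instruction in the template; the
             * constraints are the point here. */
            asm volatile("" : : "a" (&foo_parm), "m" (foo_parm) : "cc");
    }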
@@ -681,7 +643,7 @@
 		"	diag	%0,0,0x258\n"
 		"0:	nopr	%%r7\n"
 		EX_TABLE(0b,0b)
-		: : "a" (&refbk), "m" (refbk) : "cc");
+		: : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
 }
 
 static DEFINE_SPINLOCK(pfault_lock);
@@ -835,3 +797,124 @@
 early_initcall(pfault_irq_init);
 
 #endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+void do_secure_storage_access(struct pt_regs *regs)
+{
+	unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct page *page;
+	int rc;
+
+	/*
+	 * bit 61 tells us if the address is valid, if it's not we
+	 * have a major problem and should stop the kernel or send a
+	 * SIGSEGV to the process. Unfortunately bit 61 is not
+	 * reliable without the misc UV feature so we need to check
+	 * for that as well.
+	 */
+	if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+	    !test_bit_inv(61, &regs->int_parm_long)) {
+		/*
+		 * When this happens, userspace did something that it
+		 * was not supposed to do, e.g. branching into secure
+		 * memory. Trigger a segmentation fault.
+		 */
+		if (user_mode(regs)) {
+			send_sig(SIGSEGV, current, 0);
+			return;
+		}
+
+		/*
+		 * The kernel should never run into this case and we
+		 * have no way out of this situation.
+		 */
+		panic("Unexpected PGM 0x3d with TEID bit 61=0");
+	}
+
+	switch (get_fault_type(regs)) {
+	case USER_FAULT:
+		mm = current->mm;
+		mmap_read_lock(mm);
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			mmap_read_unlock(mm);
+			do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+			break;
+		}
+		page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			mmap_read_unlock(mm);
+			break;
+		}
+		if (arch_make_page_accessible(page))
+			send_sig(SIGSEGV, current, 0);
+		put_page(page);
+		mmap_read_unlock(mm);
+		break;
+	case KERNEL_FAULT:
+		page = phys_to_page(addr);
+		if (unlikely(!try_get_page(page)))
+			break;
+		rc = arch_make_page_accessible(page);
+		put_page(page);
+		if (rc)
+			BUG();
+		break;
+	case VDSO_FAULT:
+	case GMAP_FAULT:
+	default:
+		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+		WARN_ON_ONCE(1);
+	}
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+	unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+	struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+	if (get_fault_type(regs) != GMAP_FAULT) {
+		do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+		send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+	/*
+	 * Either KVM messed up the secure guest mapping or the same
+	 * page is mapped into multiple secure guests.
+	 *
+	 * This exception is only triggered when a guest 2 is running
+	 * and can therefore never occur in kernel context.
+	 */
+	printk_ratelimited(KERN_WARNING
+			   "Secure storage violation in task: %s, pid %d\n",
+			   current->comm, current->pid);
+	send_sig(SIGSEGV, current, 0);
+}
+
+#else
+void do_secure_storage_access(struct pt_regs *regs)
+{
+	default_trap_handler(regs);
+}
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+	default_trap_handler(regs);
+}
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+	default_trap_handler(regs);
+}
+#endif