From ea08eeccae9297f7aabd2ef7f0c2517ac4549acc Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:18:26 +0000
Subject: [PATCH] write in 30M
---
kernel/arch/s390/mm/fault.c | 291 +++++++++++++++++++++++++++++++++++++--------------------
1 files changed, 187 insertions(+), 104 deletions(-)
diff --git a/kernel/arch/s390/mm/fault.c b/kernel/arch/s390/mm/fault.c
index a6e3c70..1866374 100644
--- a/kernel/arch/s390/mm/fault.c
+++ b/kernel/arch/s390/mm/fault.c
@@ -33,22 +33,22 @@
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
-#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
+#include <asm/uv.h>
#include "../kernel/entry.h"
#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL
-#define VM_FAULT_BADCONTEXT 0x010000
-#define VM_FAULT_BADMAP 0x020000
-#define VM_FAULT_BADACCESS 0x040000
-#define VM_FAULT_SIGNAL 0x080000
-#define VM_FAULT_PFAULT 0x100000
+#define VM_FAULT_BADCONTEXT ((__force vm_fault_t) 0x010000)
+#define VM_FAULT_BADMAP ((__force vm_fault_t) 0x020000)
+#define VM_FAULT_BADACCESS ((__force vm_fault_t) 0x040000)
+#define VM_FAULT_SIGNAL ((__force vm_fault_t) 0x080000)
+#define VM_FAULT_PFAULT ((__force vm_fault_t) 0x100000)
enum fault_type {
KERNEL_FAULT,
@@ -67,48 +67,10 @@
}
early_initcall(fault_init);
-static inline int notify_page_fault(struct pt_regs *regs)
-{
- int ret = 0;
-
- /* kprobe_running() needs smp_processor_id() */
- if (kprobes_built_in() && !user_mode(regs)) {
- preempt_disable();
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
- ret = 1;
- preempt_enable();
- }
- return ret;
-}
-
-
-/*
- * Unlock any spinlocks which will prevent us from getting the
- * message out.
- */
-void bust_spinlocks(int yes)
-{
- if (yes) {
- oops_in_progress = 1;
- } else {
- int loglevel_save = console_loglevel;
- console_unblank();
- oops_in_progress = 0;
- /*
- * OK, the message is on the console. Now we call printk()
- * without oops_in_progress set so that printk will give klogd
- * a poke. Hold onto your hats...
- */
- console_loglevel = 15;
- printk(" ");
- console_loglevel = loglevel_save;
- }
-}
-
/*
* Find out which address space caused the exception.
*/
-static inline enum fault_type get_fault_type(struct pt_regs *regs)
+static enum fault_type get_fault_type(struct pt_regs *regs)
{
unsigned long trans_exc_code;
@@ -143,7 +105,7 @@
{
unsigned long dummy;
- return probe_kernel_address((unsigned long *)p, dummy);
+ return get_kernel_nofault(dummy, (unsigned long *)p);
}
static void dump_pagetable(unsigned long asce, unsigned long address)
@@ -160,7 +122,7 @@
if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
if (bad_address(table))
@@ -169,7 +131,7 @@
if (*table & _REGION_ENTRY_INVALID)
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
if (bad_address(table))
@@ -178,7 +140,7 @@
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
- /* fallthrough */
+ fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
if (bad_address(table))
@@ -238,6 +200,8 @@
asce = S390_lowcore.kernel_asce;
pr_cont("kernel ");
break;
+ default:
+ unreachable();
}
pr_cont("ASCE.\n");
dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
@@ -270,8 +234,19 @@
{
report_user_fault(regs, SIGSEGV, 1);
force_sig_fault(SIGSEGV, si_code,
- (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
- current);
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
+}
+
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+ const struct exception_table_entry *fixup;
+
+ fixup = search_extable(__start_dma_ex_table,
+ __stop_dma_ex_table - __start_dma_ex_table,
+ addr);
+ if (!fixup)
+ fixup = search_exception_tables(addr);
+ return fixup;
}
static noinline void do_no_context(struct pt_regs *regs)
@@ -279,11 +254,9 @@
const struct exception_table_entry *fixup;
/* Are we prepared to handle this kernel fault? */
- fixup = search_exception_tables(regs->psw.addr);
- if (fixup) {
- regs->psw.addr = extable_fixup(fixup);
+ fixup = s390_search_extables(regs->psw.addr);
+ if (fixup && ex_handle(fixup, regs))
return;
- }
/*
* Oops. The kernel tried to access some bad page. We'll have to
@@ -320,8 +293,7 @@
* or user mode.
*/
force_sig_fault(SIGBUS, BUS_ADRERR,
- (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
- current);
+ (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
static noinline int signal_return(struct pt_regs *regs)
@@ -353,6 +325,7 @@
case VM_FAULT_BADACCESS:
if (access == VM_EXEC && signal_return(regs) == 0)
break;
+ fallthrough;
case VM_FAULT_BADMAP:
/* Bad memory access. Check if it is kernel or user space. */
if (user_mode(regs)) {
@@ -362,6 +335,7 @@
do_sigsegv(regs, si_code);
break;
}
+ fallthrough;
case VM_FAULT_BADCONTEXT:
case VM_FAULT_PFAULT:
do_no_context(regs);
@@ -400,7 +374,7 @@
* routines.
*
* interruption code (int_code):
- * 04 Protection -> Write-Protection (suprression)
+ * 04 Protection -> Write-Protection (suppression)
* 10 Segment translation -> Not present (nullification)
* 11 Page translation -> Not present (nullification)
* 3b Region third trans. -> Not present (nullification)
@@ -424,7 +398,7 @@
*/
clear_pt_regs_flag(regs, PIF_PER_TRAP);
- if (notify_page_fault(regs))
+ if (kprobe_page_fault(regs, 14))
return 0;
mm = tsk->mm;
@@ -452,12 +426,14 @@
address = trans_exc_code & __FAIL_ADDR_MASK;
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
- flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+ flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
+ if ((trans_exc_code & store_indication) == 0x400)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
gmap = NULL;
if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
@@ -502,9 +478,8 @@
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(vma, address, flags);
- /* No reason to continue if interrupted by SIGKILL. */
- if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
+ fault = handle_mm_fault(vma, address, flags, regs);
+ if (fault_signal_pending(fault, regs)) {
fault = VM_FAULT_SIGNAL;
if (flags & FAULT_FLAG_RETRY_NOWAIT)
goto out_up;
@@ -513,36 +488,19 @@
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- /*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
- */
if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- tsk->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
- } else {
- tsk->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
if (fault & VM_FAULT_RETRY) {
if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
(flags & FAULT_FLAG_RETRY_NOWAIT)) {
/* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_sem has not been released */
+ * mmap_lock has not been released */
current->thread.gmap_pfault = 1;
fault = VM_FAULT_PFAULT;
goto out_up;
}
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~(FAULT_FLAG_ALLOW_RETRY |
- FAULT_FLAG_RETRY_NOWAIT);
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
flags |= FAULT_FLAG_TRIED;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
goto retry;
}
}
@@ -560,7 +518,7 @@
}
fault = 0;
out_up:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return fault;
}
@@ -607,7 +565,7 @@
int access;
vm_fault_t fault;
- access = VM_READ | VM_EXEC | VM_WRITE;
+ access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
do_fault_error(regs, access, fault);
@@ -639,17 +597,19 @@
u64 reserved;
} __attribute__ ((packed, aligned(8)));
+static struct pfault_refbk pfault_init_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 0,
+ .refdwlen = 5,
+ .refversn = 2,
+ .refgaddr = __LC_LPP,
+ .refselmk = 1ULL << 48,
+ .refcmpmk = 1ULL << 48,
+ .reserved = __PF_RES_FIELD
+};
+
int pfault_init(void)
{
- struct pfault_refbk refbk = {
- .refdiagc = 0x258,
- .reffcode = 0,
- .refdwlen = 5,
- .refversn = 2,
- .refgaddr = __LC_LPP,
- .refselmk = 1ULL << 48,
- .refcmpmk = 1ULL << 48,
- .reserved = __PF_RES_FIELD };
int rc;
if (pfault_disable)
@@ -661,18 +621,20 @@
"1: la %0,8\n"
"2:\n"
EX_TABLE(0b,1b)
- : "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
+ : "=d" (rc)
+ : "a" (&pfault_init_refbk), "m" (pfault_init_refbk) : "cc");
return rc;
}
+static struct pfault_refbk pfault_fini_refbk = {
+ .refdiagc = 0x258,
+ .reffcode = 1,
+ .refdwlen = 5,
+ .refversn = 2,
+};
+
void pfault_fini(void)
{
- struct pfault_refbk refbk = {
- .refdiagc = 0x258,
- .reffcode = 1,
- .refdwlen = 5,
- .refversn = 2,
- };
if (pfault_disable)
return;
@@ -681,7 +643,7 @@
" diag %0,0,0x258\n"
"0: nopr %%r7\n"
EX_TABLE(0b,0b)
- : : "a" (&refbk), "m" (refbk) : "cc");
+ : : "a" (&pfault_fini_refbk), "m" (pfault_fini_refbk) : "cc");
}
static DEFINE_SPINLOCK(pfault_lock);
@@ -835,3 +797,124 @@
early_initcall(pfault_irq_init);
#endif /* CONFIG_PFAULT */
+
+#if IS_ENABLED(CONFIG_PGSTE)
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ struct page *page;
+ int rc;
+
+ /*
+ * bit 61 tells us if the address is valid, if it's not we
+ * have a major problem and should stop the kernel or send a
+ * SIGSEGV to the process. Unfortunately bit 61 is not
+ * reliable without the misc UV feature so we need to check
+ * for that as well.
+ */
+ if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) &&
+ !test_bit_inv(61, ®s->int_parm_long)) {
+ /*
+ * When this happens, userspace did something that it
+ * was not supposed to do, e.g. branching into secure
+ * memory. Trigger a segmentation fault.
+ */
+ if (user_mode(regs)) {
+ send_sig(SIGSEGV, current, 0);
+ return;
+ }
+
+ /*
+ * The kernel should never run into this case and we
+ * have no way out of this situation.
+ */
+ panic("Unexpected PGM 0x3d with TEID bit 61=0");
+ }
+
+ switch (get_fault_type(regs)) {
+ case USER_FAULT:
+ mm = current->mm;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, addr);
+ if (!vma) {
+ mmap_read_unlock(mm);
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ break;
+ }
+ page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
+ if (IS_ERR_OR_NULL(page)) {
+ mmap_read_unlock(mm);
+ break;
+ }
+ if (arch_make_page_accessible(page))
+ send_sig(SIGSEGV, current, 0);
+ put_page(page);
+ mmap_read_unlock(mm);
+ break;
+ case KERNEL_FAULT:
+ page = phys_to_page(addr);
+ if (unlikely(!try_get_page(page)))
+ break;
+ rc = arch_make_page_accessible(page);
+ put_page(page);
+ if (rc)
+ BUG();
+ break;
+ case VDSO_FAULT:
+ case GMAP_FAULT:
+ default:
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ }
+}
+NOKPROBE_SYMBOL(do_secure_storage_access);
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ if (get_fault_type(regs) != GMAP_FAULT) {
+ do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL)
+ send_sig(SIGSEGV, current, 0);
+}
+NOKPROBE_SYMBOL(do_non_secure_storage_access);
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ /*
+ * Either KVM messed up the secure guest mapping or the same
+ * page is mapped into multiple secure guests.
+ *
+ * This exception is only triggered when a guest 2 is running
+ * and can therefore never occur in kernel context.
+ */
+ printk_ratelimited(KERN_WARNING
+ "Secure storage violation in task: %s, pid %d\n",
+ current->comm, current->pid);
+ send_sig(SIGSEGV, current, 0);
+}
+
+#else
+void do_secure_storage_access(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+
+void do_non_secure_storage_access(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+
+void do_secure_storage_violation(struct pt_regs *regs)
+{
+ default_trap_handler(regs);
+}
+#endif
--
Gitblit v1.6.2