| .. | .. |
|---|
| 1 | +// SPDX-License-Identifier: GPL-2.0-or-later |
|---|
| 1 | 2 | /* |
|---|
| 2 | 3 | * PowerPC version |
|---|
| 3 | 4 | * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org) |
|---|
| .. | .. |
|---|
| 8 | 9 | * Modified by Cort Dougan and Paul Mackerras. |
|---|
| 9 | 10 | * |
|---|
| 10 | 11 | * Modified for PPC64 by Dave Engebretsen (engebret@ibm.com) |
|---|
| 11 | | - * |
|---|
| 12 | | - * This program is free software; you can redistribute it and/or |
|---|
| 13 | | - * modify it under the terms of the GNU General Public License |
|---|
| 14 | | - * as published by the Free Software Foundation; either version |
|---|
| 15 | | - * 2 of the License, or (at your option) any later version. |
|---|
| 16 | 12 | */ |
|---|
| 17 | 13 | |
|---|
| 18 | 14 | #include <linux/signal.h> |
|---|
| .. | .. |
|---|
| 39 | 35 | |
|---|
| 40 | 36 | #include <asm/firmware.h> |
|---|
| 41 | 37 | #include <asm/page.h> |
|---|
| 42 | | -#include <asm/pgtable.h> |
|---|
| 43 | 38 | #include <asm/mmu.h> |
|---|
| 44 | 39 | #include <asm/mmu_context.h> |
|---|
| 45 | 40 | #include <asm/siginfo.h> |
|---|
| 46 | 41 | #include <asm/debug.h> |
|---|
| 42 | +#include <asm/kup.h> |
|---|
| 43 | +#include <asm/inst.h> |
|---|
| 47 | 44 | |
|---|
| 48 | | -static inline bool notify_page_fault(struct pt_regs *regs) |
|---|
| 49 | | -{ |
|---|
| 50 | | - bool ret = false; |
|---|
| 51 | 45 | |
|---|
| 52 | | -#ifdef CONFIG_KPROBES |
|---|
| 53 | | - /* kprobe_running() needs smp_processor_id() */ |
|---|
| 54 | | - if (!user_mode(regs)) { |
|---|
| 55 | | - preempt_disable(); |
|---|
| 56 | | - if (kprobe_running() && kprobe_fault_handler(regs, 11)) |
|---|
| 57 | | - ret = true; |
|---|
| 58 | | - preempt_enable(); |
|---|
| 59 | | - } |
|---|
| 60 | | -#endif /* CONFIG_KPROBES */ |
|---|
| 61 | | - |
|---|
| 62 | | - if (unlikely(debugger_fault_handler(regs))) |
|---|
| 63 | | - ret = true; |
|---|
| 64 | | - |
|---|
| 65 | | - return ret; |
|---|
| 66 | | -} |
|---|
| 67 | | - |
|---|
| 68 | | -/* |
|---|
| 69 | | - * Check whether the instruction inst is a store using |
|---|
| 70 | | - * an update addressing form which will update r1. |
|---|
| 71 | | - */ |
|---|
| 72 | | -static bool store_updates_sp(unsigned int inst) |
|---|
| 73 | | -{ |
|---|
| 74 | | - /* check for 1 in the rA field */ |
|---|
| 75 | | - if (((inst >> 16) & 0x1f) != 1) |
|---|
| 76 | | - return false; |
|---|
| 77 | | - /* check major opcode */ |
|---|
| 78 | | - switch (inst >> 26) { |
|---|
| 79 | | - case OP_STWU: |
|---|
| 80 | | - case OP_STBU: |
|---|
| 81 | | - case OP_STHU: |
|---|
| 82 | | - case OP_STFSU: |
|---|
| 83 | | - case OP_STFDU: |
|---|
| 84 | | - return true; |
|---|
| 85 | | - case OP_STD: /* std or stdu */ |
|---|
| 86 | | - return (inst & 3) == 1; |
|---|
| 87 | | - case OP_31: |
|---|
| 88 | | - /* check minor opcode */ |
|---|
| 89 | | - switch ((inst >> 1) & 0x3ff) { |
|---|
| 90 | | - case OP_31_XOP_STDUX: |
|---|
| 91 | | - case OP_31_XOP_STWUX: |
|---|
| 92 | | - case OP_31_XOP_STBUX: |
|---|
| 93 | | - case OP_31_XOP_STHUX: |
|---|
| 94 | | - case OP_31_XOP_STFSUX: |
|---|
| 95 | | - case OP_31_XOP_STFDUX: |
|---|
| 96 | | - return true; |
|---|
| 97 | | - } |
|---|
| 98 | | - } |
|---|
| 99 | | - return false; |
|---|
| 100 | | -} |
|---|
| 101 | 46 | /* |
|---|
| 102 | 47 | * do_page_fault error handling helpers |
|---|
| 103 | 48 | */ |
|---|
| 104 | 49 | |
|---|
| 105 | 50 | static int |
|---|
| 106 | | -__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code, |
|---|
| 107 | | - int pkey) |
|---|
| 51 | +__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code) |
|---|
| 108 | 52 | { |
|---|
| 109 | 53 | /* |
|---|
| 110 | 54 | * If we are in kernel mode, bail out with a SEGV, this will |
|---|
| .. | .. |
|---|
| 114 | 58 | if (!user_mode(regs)) |
|---|
| 115 | 59 | return SIGSEGV; |
|---|
| 116 | 60 | |
|---|
| 117 | | - _exception_pkey(SIGSEGV, regs, si_code, address, pkey); |
|---|
| 61 | + _exception(SIGSEGV, regs, si_code, address); |
|---|
| 118 | 62 | |
|---|
| 119 | 63 | return 0; |
|---|
| 120 | 64 | } |
|---|
| 121 | 65 | |
|---|
| 122 | 66 | static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address) |
|---|
| 123 | 67 | { |
|---|
| 124 | | - return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0); |
|---|
| 68 | + return __bad_area_nosemaphore(regs, address, SEGV_MAPERR); |
|---|
| 125 | 69 | } |
|---|
| 126 | 70 | |
|---|
| 127 | | -static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code, |
|---|
| 128 | | - int pkey) |
|---|
| 71 | +static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code) |
|---|
| 129 | 72 | { |
|---|
| 130 | 73 | struct mm_struct *mm = current->mm; |
|---|
| 131 | 74 | |
|---|
| .. | .. |
|---|
| 133 | 76 | * Something tried to access memory that isn't in our memory map.. |
|---|
| 134 | 77 | * Fix it, but check if it's kernel or user first.. |
|---|
| 135 | 78 | */ |
|---|
| 136 | | - up_read(&mm->mmap_sem); |
|---|
| 79 | + mmap_read_unlock(mm); |
|---|
| 137 | 80 | |
|---|
| 138 | | - return __bad_area_nosemaphore(regs, address, si_code, pkey); |
|---|
| 81 | + return __bad_area_nosemaphore(regs, address, si_code); |
|---|
| 139 | 82 | } |
|---|
| 140 | 83 | |
|---|
| 141 | 84 | static noinline int bad_area(struct pt_regs *regs, unsigned long address) |
|---|
| 142 | 85 | { |
|---|
| 143 | | - return __bad_area(regs, address, SEGV_MAPERR, 0); |
|---|
| 86 | + return __bad_area(regs, address, SEGV_MAPERR); |
|---|
| 144 | 87 | } |
|---|
| 145 | 88 | |
|---|
| 146 | | -static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address, |
|---|
| 147 | | - int pkey) |
|---|
| 89 | +#ifdef CONFIG_PPC_MEM_KEYS |
|---|
| 90 | +static noinline int bad_access_pkey(struct pt_regs *regs, unsigned long address, |
|---|
| 91 | + struct vm_area_struct *vma) |
|---|
| 148 | 92 | { |
|---|
| 149 | | - return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey); |
|---|
| 93 | + struct mm_struct *mm = current->mm; |
|---|
| 94 | + int pkey; |
|---|
| 95 | + |
|---|
| 96 | + /* |
|---|
| 97 | + * We don't try to fetch the pkey from page table because reading |
|---|
| 98 | + * page table without locking doesn't guarantee stable pte value. |
|---|
| 99 | + * Hence the pkey value that we return to userspace can be different |
|---|
| 100 | + * from the pkey that actually caused access error. |
|---|
| 101 | + * |
|---|
| 102 | + * It does *not* guarantee that the VMA we find here |
|---|
| 103 | + * was the one that we faulted on. |
|---|
| 104 | + * |
|---|
| 105 | + * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4); |
|---|
| 106 | + * 2. T1 : set AMR to deny access to pkey=4, touches page |
|---|
| 107 | + * 3. T1 : faults... |
|---|
| 108 | + * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5); |
|---|
| 109 | + * 5. T1 : enters fault handler, takes mmap_lock, etc... |
|---|
| 110 | + * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really |
|---|
| 111 | + * faulted on a pte with its pkey=4. |
|---|
| 112 | + */ |
|---|
| 113 | + pkey = vma_pkey(vma); |
|---|
| 114 | + |
|---|
| 115 | + mmap_read_unlock(mm); |
|---|
| 116 | + |
|---|
| 117 | + /* |
|---|
| 118 | + * If we are in kernel mode, bail out with a SEGV, this will |
|---|
| 119 | + * be caught by the assembly which will restore the non-volatile |
|---|
| 120 | + * registers before calling bad_page_fault() |
|---|
| 121 | + */ |
|---|
| 122 | + if (!user_mode(regs)) |
|---|
| 123 | + return SIGSEGV; |
|---|
| 124 | + |
|---|
| 125 | + _exception_pkey(regs, address, pkey); |
|---|
| 126 | + |
|---|
| 127 | + return 0; |
|---|
| 150 | 128 | } |
|---|
| 129 | +#endif |
|---|
| 151 | 130 | |
|---|
| 152 | 131 | static noinline int bad_access(struct pt_regs *regs, unsigned long address) |
|---|
| 153 | 132 | { |
|---|
| 154 | | - return __bad_area(regs, address, SEGV_ACCERR, 0); |
|---|
| 133 | + return __bad_area(regs, address, SEGV_ACCERR); |
|---|
| 155 | 134 | } |
|---|
| 156 | 135 | |
|---|
| 157 | 136 | static int do_sigbus(struct pt_regs *regs, unsigned long address, |
|---|
| 158 | 137 | vm_fault_t fault) |
|---|
| 159 | 138 | { |
|---|
| 160 | | - siginfo_t info; |
|---|
| 161 | | - unsigned int lsb = 0; |
|---|
| 162 | | - |
|---|
| 163 | 139 | if (!user_mode(regs)) |
|---|
| 164 | 140 | return SIGBUS; |
|---|
| 165 | 141 | |
|---|
| 166 | 142 | current->thread.trap_nr = BUS_ADRERR; |
|---|
| 167 | | - clear_siginfo(&info); |
|---|
| 168 | | - info.si_signo = SIGBUS; |
|---|
| 169 | | - info.si_errno = 0; |
|---|
| 170 | | - info.si_code = BUS_ADRERR; |
|---|
| 171 | | - info.si_addr = (void __user *)address; |
|---|
| 172 | 143 | #ifdef CONFIG_MEMORY_FAILURE |
|---|
| 173 | 144 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
|---|
| 145 | + unsigned int lsb = 0; /* shutup gcc */ |
|---|
| 146 | + |
|---|
| 174 | 147 | pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
|---|
| 175 | 148 | current->comm, current->pid, address); |
|---|
| 176 | | - info.si_code = BUS_MCEERR_AR; |
|---|
| 149 | + |
|---|
| 150 | + if (fault & VM_FAULT_HWPOISON_LARGE) |
|---|
| 151 | + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); |
|---|
| 152 | + if (fault & VM_FAULT_HWPOISON) |
|---|
| 153 | + lsb = PAGE_SHIFT; |
|---|
| 154 | + |
|---|
| 155 | + force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb); |
|---|
| 156 | + return 0; |
|---|
| 177 | 157 | } |
|---|
| 178 | 158 | |
|---|
| 179 | | - if (fault & VM_FAULT_HWPOISON_LARGE) |
|---|
| 180 | | - lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); |
|---|
| 181 | | - if (fault & VM_FAULT_HWPOISON) |
|---|
| 182 | | - lsb = PAGE_SHIFT; |
|---|
| 183 | 159 | #endif |
|---|
| 184 | | - info.si_addr_lsb = lsb; |
|---|
| 185 | | - force_sig_info(SIGBUS, &info, current); |
|---|
| 160 | + force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address); |
|---|
| 186 | 161 | return 0; |
|---|
| 187 | 162 | } |
|---|
| 188 | 163 | |
|---|
| .. | .. |
|---|
| 218 | 193 | } |
|---|
| 219 | 194 | |
|---|
| 220 | 195 | /* Is this a bad kernel fault ? */ |
|---|
| 221 | | -static bool bad_kernel_fault(bool is_exec, unsigned long error_code, |
|---|
| 222 | | - unsigned long address) |
|---|
| 196 | +static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code, |
|---|
| 197 | + unsigned long address, bool is_write) |
|---|
| 223 | 198 | { |
|---|
| 224 | | - /* NX faults set DSISR_PROTFAULT on the 8xx, DSISR_NOEXEC_OR_G on others */ |
|---|
| 225 | | - if (is_exec && (error_code & (DSISR_NOEXEC_OR_G | DSISR_KEYFAULT | |
|---|
| 226 | | - DSISR_PROTFAULT))) { |
|---|
| 227 | | - printk_ratelimited(KERN_CRIT "kernel tried to execute" |
|---|
| 228 | | - " exec-protected page (%lx) -" |
|---|
| 229 | | - "exploit attempt? (uid: %d)\n", |
|---|
| 230 | | - address, from_kuid(&init_user_ns, |
|---|
| 231 | | - current_uid())); |
|---|
| 232 | | - } |
|---|
| 233 | | - return is_exec || (address >= TASK_SIZE); |
|---|
| 234 | | -} |
|---|
| 199 | + int is_exec = TRAP(regs) == 0x400; |
|---|
| 235 | 200 | |
|---|
| 236 | | -// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE |
|---|
| 237 | | -#define SIGFRAME_MAX_SIZE (4096 + 128) |
|---|
| 201 | + if (is_exec) { |
|---|
| 202 | + pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n", |
|---|
| 203 | + address >= TASK_SIZE ? "exec-protected" : "user", |
|---|
| 204 | + address, |
|---|
| 205 | + from_kuid(&init_user_ns, current_uid())); |
|---|
| 238 | 206 | |
|---|
| 239 | | -static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address, |
|---|
| 240 | | - struct vm_area_struct *vma, unsigned int flags, |
|---|
| 241 | | - bool *must_retry) |
|---|
| 242 | | -{ |
|---|
| 243 | | - /* |
|---|
| 244 | | - * N.B. The POWER/Open ABI allows programs to access up to |
|---|
| 245 | | - * 288 bytes below the stack pointer. |
|---|
| 246 | | - * The kernel signal delivery code writes a bit over 4KB |
|---|
| 247 | | - * below the stack pointer (r1) before decrementing it. |
|---|
| 248 | | - * The exec code can write slightly over 640kB to the stack |
|---|
| 249 | | - * before setting the user r1. Thus we allow the stack to |
|---|
| 250 | | - * expand to 1MB without further checks. |
|---|
| 251 | | - */ |
|---|
| 252 | | - if (address + 0x100000 < vma->vm_end) { |
|---|
| 253 | | - unsigned int __user *nip = (unsigned int __user *)regs->nip; |
|---|
| 254 | | - /* get user regs even if this fault is in kernel mode */ |
|---|
| 255 | | - struct pt_regs *uregs = current->thread.regs; |
|---|
| 256 | | - if (uregs == NULL) |
|---|
| 257 | | - return true; |
|---|
| 258 | | - |
|---|
| 259 | | - /* |
|---|
| 260 | | - * A user-mode access to an address a long way below |
|---|
| 261 | | - * the stack pointer is only valid if the instruction |
|---|
| 262 | | - * is one which would update the stack pointer to the |
|---|
| 263 | | - * address accessed if the instruction completed, |
|---|
| 264 | | - * i.e. either stwu rs,n(r1) or stwux rs,r1,rb |
|---|
| 265 | | - * (or the byte, halfword, float or double forms). |
|---|
| 266 | | - * |
|---|
| 267 | | - * If we don't check this then any write to the area |
|---|
| 268 | | - * between the last mapped region and the stack will |
|---|
| 269 | | - * expand the stack rather than segfaulting. |
|---|
| 270 | | - */ |
|---|
| 271 | | - if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1]) |
|---|
| 272 | | - return false; |
|---|
| 273 | | - |
|---|
| 274 | | - if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) && |
|---|
| 275 | | - access_ok(VERIFY_READ, nip, sizeof(*nip))) { |
|---|
| 276 | | - unsigned int inst; |
|---|
| 277 | | - int res; |
|---|
| 278 | | - |
|---|
| 279 | | - pagefault_disable(); |
|---|
| 280 | | - res = __get_user_inatomic(inst, nip); |
|---|
| 281 | | - pagefault_enable(); |
|---|
| 282 | | - if (!res) |
|---|
| 283 | | - return !store_updates_sp(inst); |
|---|
| 284 | | - *must_retry = true; |
|---|
| 285 | | - } |
|---|
| 207 | + // Kernel exec fault is always bad |
|---|
| 286 | 208 | return true; |
|---|
| 287 | 209 | } |
|---|
| 210 | + |
|---|
| 211 | + if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) && |
|---|
| 212 | + !search_exception_tables(regs->nip)) { |
|---|
| 213 | + pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n", |
|---|
| 214 | + address, |
|---|
| 215 | + from_kuid(&init_user_ns, current_uid())); |
|---|
| 216 | + } |
|---|
| 217 | + |
|---|
| 218 | + // Kernel fault on kernel address is bad |
|---|
| 219 | + if (address >= TASK_SIZE) |
|---|
| 220 | + return true; |
|---|
| 221 | + |
|---|
| 222 | + // Fault on a user address from code outside the exception-table regions (e.g. copy_tofrom_user()) is bad |
|---|
| 223 | + if (!search_exception_tables(regs->nip)) |
|---|
| 224 | + return true; |
|---|
| 225 | + |
|---|
| 226 | + // Read/write fault in a valid region (the exception table search passed |
|---|
| 227 | + // above), but blocked by KUAP is bad; it can never succeed. |
|---|
| 228 | + if (bad_kuap_fault(regs, address, is_write)) |
|---|
| 229 | + return true; |
|---|
| 230 | + |
|---|
| 231 | + // What's left? Kernel fault on user in well defined regions (extable |
|---|
| 232 | + // matched), and allowed by KUAP in the faulting context. |
|---|
| 288 | 233 | return false; |
|---|
| 289 | 234 | } |
|---|
| 290 | 235 | |
|---|
| 291 | | -static bool access_error(bool is_write, bool is_exec, |
|---|
| 292 | | - struct vm_area_struct *vma) |
|---|
| 236 | +#ifdef CONFIG_PPC_MEM_KEYS |
|---|
| 237 | +static bool access_pkey_error(bool is_write, bool is_exec, bool is_pkey, |
|---|
| 238 | + struct vm_area_struct *vma) |
|---|
| 239 | +{ |
|---|
| 240 | + /* |
|---|
| 241 | + * Make sure to check the VMA so that we do not perform |
|---|
| 242 | + * faults just to hit a pkey fault as soon as we fill in a |
|---|
| 243 | + * page. Only called for current mm, hence foreign == 0 |
|---|
| 244 | + */ |
|---|
| 245 | + if (!arch_vma_access_permitted(vma, is_write, is_exec, 0)) |
|---|
| 246 | + return true; |
|---|
| 247 | + |
|---|
| 248 | + return false; |
|---|
| 249 | +} |
|---|
| 250 | +#endif |
|---|
| 251 | + |
|---|
| 252 | +static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma) |
|---|
| 293 | 253 | { |
|---|
| 294 | 254 | /* |
|---|
| 295 | 255 | * Allow execution from readable areas if the MMU does not |
|---|
| .. | .. |
|---|
| 313 | 273 | return false; |
|---|
| 314 | 274 | } |
|---|
| 315 | 275 | |
|---|
| 316 | | - if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))) |
|---|
| 276 | + if (unlikely(!vma_is_accessible(vma))) |
|---|
| 317 | 277 | return true; |
|---|
| 318 | 278 | /* |
|---|
| 319 | 279 | * We should ideally do the vma pkey access check here. But in the |
|---|
| .. | .. |
|---|
| 341 | 301 | static inline void cmo_account_page_fault(void) { } |
|---|
| 342 | 302 | #endif /* CONFIG_PPC_SMLPAR */ |
|---|
| 343 | 303 | |
|---|
| 344 | | -#ifdef CONFIG_PPC_STD_MMU |
|---|
| 345 | | -static void sanity_check_fault(bool is_write, unsigned long error_code) |
|---|
| 304 | +static void sanity_check_fault(bool is_write, bool is_user, |
|---|
| 305 | + unsigned long error_code, unsigned long address) |
|---|
| 346 | 306 | { |
|---|
| 307 | + /* |
|---|
| 308 | + * Userspace trying to access kernel address, we get PROTFAULT for that. |
|---|
| 309 | + */ |
|---|
| 310 | + if (is_user && address >= TASK_SIZE) { |
|---|
| 311 | + if ((long)address == -1) |
|---|
| 312 | + return; |
|---|
| 313 | + |
|---|
| 314 | + pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n", |
|---|
| 315 | + current->comm, current->pid, address, |
|---|
| 316 | + from_kuid(&init_user_ns, current_uid())); |
|---|
| 317 | + return; |
|---|
| 318 | + } |
|---|
| 319 | + |
|---|
| 320 | + if (!IS_ENABLED(CONFIG_PPC_BOOK3S)) |
|---|
| 321 | + return; |
|---|
| 322 | + |
|---|
| 347 | 323 | /* |
|---|
| 348 | 324 | * For hash translation mode, we should never get a |
|---|
| 349 | 325 | * PROTFAULT. Any update to pte to reduce access will result in us |
|---|
| .. | .. |
|---|
| 373 | 349 | * For radix, we can get prot fault for autonuma case, because radix |
|---|
| 374 | 350 | * page table will have them marked noaccess for user. |
|---|
| 375 | 351 | */ |
|---|
| 376 | | - if (!radix_enabled() && !is_write) |
|---|
| 377 | | - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); |
|---|
| 352 | + if (radix_enabled() || is_write) |
|---|
| 353 | + return; |
|---|
| 354 | + |
|---|
| 355 | + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); |
|---|
| 378 | 356 | } |
|---|
| 379 | | -#else |
|---|
| 380 | | -static void sanity_check_fault(bool is_write, unsigned long error_code) { } |
|---|
| 381 | | -#endif /* CONFIG_PPC_STD_MMU */ |
|---|
| 382 | 357 | |
|---|
| 383 | 358 | /* |
|---|
| 384 | 359 | * Define the correct "is_write" bit in error_code based |
|---|
| .. | .. |
|---|
| 416 | 391 | { |
|---|
| 417 | 392 | struct vm_area_struct * vma; |
|---|
| 418 | 393 | struct mm_struct *mm = current->mm; |
|---|
| 419 | | - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; |
|---|
| 394 | + unsigned int flags = FAULT_FLAG_DEFAULT; |
|---|
| 420 | 395 | int is_exec = TRAP(regs) == 0x400; |
|---|
| 421 | 396 | int is_user = user_mode(regs); |
|---|
| 422 | 397 | int is_write = page_fault_is_write(error_code); |
|---|
| 423 | 398 | vm_fault_t fault, major = 0; |
|---|
| 424 | | - bool must_retry = false; |
|---|
| 399 | + bool kprobe_fault = kprobe_page_fault(regs, 11); |
|---|
| 425 | 400 | |
|---|
| 426 | | - if (notify_page_fault(regs)) |
|---|
| 401 | + if (unlikely(debugger_fault_handler(regs) || kprobe_fault)) |
|---|
| 427 | 402 | return 0; |
|---|
| 428 | 403 | |
|---|
| 429 | 404 | if (unlikely(page_fault_is_bad(error_code))) { |
|---|
| .. | .. |
|---|
| 435 | 410 | } |
|---|
| 436 | 411 | |
|---|
| 437 | 412 | /* Additional sanity check(s) */ |
|---|
| 438 | | - sanity_check_fault(is_write, error_code); |
|---|
| 413 | + sanity_check_fault(is_write, is_user, error_code, address); |
|---|
| 439 | 414 | |
|---|
| 440 | 415 | /* |
|---|
| 441 | 416 | * The kernel should never take an execute fault nor should it |
|---|
| 442 | | - * take a page fault to a kernel address. |
|---|
| 417 | + * take a page fault to a kernel address or a page fault to a user |
|---|
| 418 | + * address outside of dedicated places |
|---|
| 443 | 419 | */ |
|---|
| 444 | | - if (unlikely(!is_user && bad_kernel_fault(is_exec, error_code, address))) |
|---|
| 420 | + if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write))) |
|---|
| 445 | 421 | return SIGSEGV; |
|---|
| 446 | 422 | |
|---|
| 447 | 423 | /* |
|---|
| .. | .. |
|---|
| 463 | 439 | |
|---|
| 464 | 440 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); |
|---|
| 465 | 441 | |
|---|
| 466 | | - if (error_code & DSISR_KEYFAULT) |
|---|
| 467 | | - return bad_key_fault_exception(regs, address, |
|---|
| 468 | | - get_mm_addr_key(mm, address)); |
|---|
| 469 | | - |
|---|
| 470 | 442 | /* |
|---|
| 471 | | - * We want to do this outside mmap_sem, because reading code around nip |
|---|
| 443 | + * We want to do this outside mmap_lock, because reading code around nip |
|---|
| 472 | 444 | * can result in fault, which will cause a deadlock when called with |
|---|
| 473 | | - * mmap_sem held |
|---|
| 445 | + * mmap_lock held |
|---|
| 474 | 446 | */ |
|---|
| 475 | 447 | if (is_user) |
|---|
| 476 | 448 | flags |= FAULT_FLAG_USER; |
|---|
| .. | .. |
|---|
| 482 | 454 | /* When running in the kernel we expect faults to occur only to |
|---|
| 483 | 455 | * addresses in user space. All other faults represent errors in the |
|---|
| 484 | 456 | * kernel and should generate an OOPS. Unfortunately, in the case of an |
|---|
| 485 | | - * erroneous fault occurring in a code path which already holds mmap_sem |
|---|
| 457 | + * erroneous fault occurring in a code path which already holds mmap_lock |
|---|
| 486 | 458 | * we will deadlock attempting to validate the fault against the |
|---|
| 487 | 459 | * address space. Luckily the kernel only validly references user |
|---|
| 488 | 460 | * space from well defined areas of code, which are listed in the |
|---|
| .. | .. |
|---|
| 494 | 466 | * source. If this is invalid we can skip the address space check, |
|---|
| 495 | 467 | * thus avoiding the deadlock. |
|---|
| 496 | 468 | */ |
|---|
| 497 | | - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
|---|
| 469 | + if (unlikely(!mmap_read_trylock(mm))) { |
|---|
| 498 | 470 | if (!is_user && !search_exception_tables(regs->nip)) |
|---|
| 499 | 471 | return bad_area_nosemaphore(regs, address); |
|---|
| 500 | 472 | |
|---|
| 501 | 473 | retry: |
|---|
| 502 | | - down_read(&mm->mmap_sem); |
|---|
| 474 | + mmap_read_lock(mm); |
|---|
| 503 | 475 | } else { |
|---|
| 504 | 476 | /* |
|---|
| 505 | 477 | * The above mmap_read_trylock() might have succeeded in |
|---|
| .. | .. |
|---|
| 512 | 484 | vma = find_vma(mm, address); |
|---|
| 513 | 485 | if (unlikely(!vma)) |
|---|
| 514 | 486 | return bad_area(regs, address); |
|---|
| 515 | | - if (likely(vma->vm_start <= address)) |
|---|
| 516 | | - goto good_area; |
|---|
| 517 | | - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) |
|---|
| 518 | | - return bad_area(regs, address); |
|---|
| 519 | 487 | |
|---|
| 520 | | - /* The stack is being expanded, check if it's valid */ |
|---|
| 521 | | - if (unlikely(bad_stack_expansion(regs, address, vma, flags, |
|---|
| 522 | | - &must_retry))) { |
|---|
| 523 | | - if (!must_retry) |
|---|
| 488 | + if (unlikely(vma->vm_start > address)) { |
|---|
| 489 | + if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) |
|---|
| 524 | 490 | return bad_area(regs, address); |
|---|
| 525 | 491 | |
|---|
| 526 | | - up_read(&mm->mmap_sem); |
|---|
| 527 | | - if (fault_in_pages_readable((const char __user *)regs->nip, |
|---|
| 528 | | - sizeof(unsigned int))) |
|---|
| 529 | | - return bad_area_nosemaphore(regs, address); |
|---|
| 530 | | - goto retry; |
|---|
| 492 | + if (unlikely(expand_stack(vma, address))) |
|---|
| 493 | + return bad_area(regs, address); |
|---|
| 531 | 494 | } |
|---|
| 532 | 495 | |
|---|
| 533 | | - /* Try to expand it */ |
|---|
| 534 | | - if (unlikely(expand_stack(vma, address))) |
|---|
| 535 | | - return bad_area(regs, address); |
|---|
| 496 | +#ifdef CONFIG_PPC_MEM_KEYS |
|---|
| 497 | + if (unlikely(access_pkey_error(is_write, is_exec, |
|---|
| 498 | + (error_code & DSISR_KEYFAULT), vma))) |
|---|
| 499 | + return bad_access_pkey(regs, address, vma); |
|---|
| 500 | +#endif /* CONFIG_PPC_MEM_KEYS */ |
|---|
| 536 | 501 | |
|---|
| 537 | | -good_area: |
|---|
| 538 | 502 | if (unlikely(access_error(is_write, is_exec, vma))) |
|---|
| 539 | 503 | return bad_access(regs, address); |
|---|
| 540 | 504 | |
|---|
| .. | .. |
|---|
| 543 | 507 | * make sure we exit gracefully rather than endlessly redo |
|---|
| 544 | 508 | * the fault. |
|---|
| 545 | 509 | */ |
|---|
| 546 | | - fault = handle_mm_fault(vma, address, flags); |
|---|
| 547 | | - |
|---|
| 548 | | -#ifdef CONFIG_PPC_MEM_KEYS |
|---|
| 549 | | - /* |
|---|
| 550 | | - * we skipped checking for access error due to key earlier. |
|---|
| 551 | | - * Check that using handle_mm_fault error return. |
|---|
| 552 | | - */ |
|---|
| 553 | | - if (unlikely(fault & VM_FAULT_SIGSEGV) && |
|---|
| 554 | | - !arch_vma_access_permitted(vma, is_write, is_exec, 0)) { |
|---|
| 555 | | - |
|---|
| 556 | | - int pkey = vma_pkey(vma); |
|---|
| 557 | | - |
|---|
| 558 | | - up_read(&mm->mmap_sem); |
|---|
| 559 | | - return bad_key_fault_exception(regs, address, pkey); |
|---|
| 560 | | - } |
|---|
| 561 | | -#endif /* CONFIG_PPC_MEM_KEYS */ |
|---|
| 510 | + fault = handle_mm_fault(vma, address, flags, regs); |
|---|
| 562 | 511 | |
|---|
| 563 | 512 | major |= fault & VM_FAULT_MAJOR; |
|---|
| 564 | 513 | |
|---|
| 514 | + if (fault_signal_pending(fault, regs)) |
|---|
| 515 | + return user_mode(regs) ? 0 : SIGBUS; |
|---|
| 516 | + |
|---|
| 565 | 517 | /* |
|---|
| 566 | | - * Handle the retry right now, the mmap_sem has been released in that |
|---|
| 518 | + * Handle the retry right now, the mmap_lock has been released in that |
|---|
| 567 | 519 | * case. |
|---|
| 568 | 520 | */ |
|---|
| 569 | 521 | if (unlikely(fault & VM_FAULT_RETRY)) { |
|---|
| 570 | | - /* We retry only once */ |
|---|
| 571 | 522 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
|---|
| 572 | | - /* |
|---|
| 573 | | - * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk |
|---|
| 574 | | - * of starvation. |
|---|
| 575 | | - */ |
|---|
| 576 | | - flags &= ~FAULT_FLAG_ALLOW_RETRY; |
|---|
| 577 | 523 | flags |= FAULT_FLAG_TRIED; |
|---|
| 578 | | - if (!fatal_signal_pending(current)) |
|---|
| 579 | | - goto retry; |
|---|
| 524 | + goto retry; |
|---|
| 580 | 525 | } |
|---|
| 581 | | - |
|---|
| 582 | | - /* |
|---|
| 583 | | - * User mode? Just return to handle the fatal exception otherwise |
|---|
| 584 | | - * return to bad_page_fault |
|---|
| 585 | | - */ |
|---|
| 586 | | - return is_user ? 0 : SIGBUS; |
|---|
| 587 | 526 | } |
|---|
| 588 | 527 | |
|---|
| 589 | | - up_read(¤t->mm->mmap_sem); |
|---|
| 528 | + mmap_read_unlock(current->mm); |
|---|
| 590 | 529 | |
|---|
| 591 | 530 | if (unlikely(fault & VM_FAULT_ERROR)) |
|---|
| 592 | 531 | return mm_fault_error(regs, address, fault); |
|---|
| .. | .. |
|---|
| 594 | 533 | /* |
|---|
| 595 | 534 | * Major/minor page fault accounting. |
|---|
| 596 | 535 | */ |
|---|
| 597 | | - if (major) { |
|---|
| 598 | | - current->maj_flt++; |
|---|
| 599 | | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); |
|---|
| 536 | + if (major) |
|---|
| 600 | 537 | cmo_account_page_fault(); |
|---|
| 601 | | - } else { |
|---|
| 602 | | - current->min_flt++; |
|---|
| 603 | | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); |
|---|
| 604 | | - } |
|---|
| 538 | + |
|---|
| 605 | 539 | return 0; |
|---|
| 606 | 540 | } |
|---|
| 607 | 541 | NOKPROBE_SYMBOL(__do_page_fault); |
|---|
| .. | .. |
|---|
| 624 | 558 | void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) |
|---|
| 625 | 559 | { |
|---|
| 626 | 560 | const struct exception_table_entry *entry; |
|---|
| 561 | + int is_write = page_fault_is_write(regs->dsisr); |
|---|
| 627 | 562 | |
|---|
| 628 | 563 | /* Are we prepared to handle this fault? */ |
|---|
| 629 | 564 | if ((entry = search_exception_tables(regs->nip)) != NULL) { |
|---|
| .. | .. |
|---|
| 636 | 571 | switch (TRAP(regs)) { |
|---|
| 637 | 572 | case 0x300: |
|---|
| 638 | 573 | case 0x380: |
|---|
| 639 | | - pr_alert("BUG: %s at 0x%08lx\n", |
|---|
| 574 | + case 0xe00: |
|---|
| 575 | + pr_alert("BUG: %s on %s at 0x%08lx\n", |
|---|
| 640 | 576 | regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" : |
|---|
| 641 | | - "Unable to handle kernel data access", regs->dar); |
|---|
| 577 | + "Unable to handle kernel data access", |
|---|
| 578 | + is_write ? "write" : "read", regs->dar); |
|---|
| 642 | 579 | break; |
|---|
| 643 | 580 | case 0x400: |
|---|
| 644 | 581 | case 0x480: |
|---|