@@ -8 +8 @@
 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/extable.h>		/* search_exception_tables	*/
-#include <linux/bootmem.h>		/* max_low_pfn			*/
+#include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
@@ -16 +17 @@
 #include <linux/prefetch.h>		/* prefetchw			*/
 #include <linux/context_tracking.h>	/* exception_enter(), ...	*/
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
+#include <linux/efi.h>			/* efi_recover_from_page_fault()*/
 #include <linux/mm_types.h>

 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
-#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
 #include <asm/vm86.h>			/* struct vm86			*/
 #include <asm/mmu_context.h>		/* vma_pkey()			*/
+#include <asm/efi.h>			/* efi_recover_from_page_fault()*/
+#include <asm/desc.h>			/* store_idt(), ...		*/
+#include <asm/cpu_entry_area.h>		/* exception stack		*/
+#include <asm/pgtable_areas.h>		/* VMALLOC_START, ...		*/
+#include <asm/kvm_para.h>		/* kvm_handle_async_pf		*/

 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
@@ -42 +48 @@
        return 0;
 }

-static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
-{
-        int ret = 0;
-
-        /* kprobe_running() needs smp_processor_id() */
-        if (kprobes_built_in() && !user_mode(regs)) {
-                preempt_disable();
-                if (kprobe_running() && kprobe_fault_handler(regs, 14))
-                        ret = 1;
-                preempt_enable();
-        }
-
-        return ret;
-}
-
 /*
  * Prefetch quirks:
  *
  * 32-bit mode:
  *
  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
- *   Check that here and ignore it.
+ *   Check that here and ignore it.  This is AMD erratum #91.
  *
  * 64-bit mode:
  *
@@ -92 +83 @@
 #ifdef CONFIG_X86_64
        case 0x40:
                /*
-                * In AMD64 long mode 0x40..0x4F are valid REX prefixes
-                * Need to figure out under what instruction mode the
-                * instruction was issued. Could check the LDT for lm,
-                * but for now it's good enough to assume that long
-                * mode only uses well known segments or kernel.
+                * In 64-bit mode 0x40..0x4F are valid REX prefixes
                 */
                return (!user_mode(regs) || user_64bit_mode(regs));
 #endif
@@ -108 +95 @@
                return !instr_lo || (instr_lo>>1) == 1;
        case 0x00:
                /* Prefetch instruction is 0x0F0D or 0x0F18 */
-                if (probe_kernel_address(instr, opcode))
+                if (get_kernel_nofault(opcode, instr))
                        return 0;

                *prefetch = (instr_lo == 0xF) &&
@@ -136 +123 @@
        instr = (void *)convert_ip_to_linear(current, regs);
        max_instr = instr + 15;

-        if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
-                return 0;
+        /*
+         * This code has historically always bailed out if IP points to a
+         * not-present page (e.g. due to a race).  No one has ever
+         * complained about this.
+         */
+        pagefault_disable();

        while (instr < max_instr) {
                unsigned char opcode;

-                if (probe_kernel_address(instr, opcode))
-                        break;
+                if (user_mode(regs)) {
+                        if (get_user(opcode, instr))
+                                break;
+                } else {
+                        if (get_kernel_nofault(opcode, instr))
+                                break;
+                }

                instr++;

                if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
                        break;
        }
+
+        pagefault_enable();
        return prefetch;
-}
-
-/*
- * A protection key fault means that the PKRU value did not allow
- * access to some PTE.  Userspace can figure out what PKRU was
- * from the XSAVE state, and this function fills out a field in
- * siginfo so userspace can discover which protection key was set
- * on the PTE.
- *
- * If we get here, we know that the hardware signaled a X86_PF_PK
- * fault and that there was a VMA once we got in the fault
- * handler.  It does *not* guarantee that the VMA we find here
- * was the one that we faulted on.
- *
- * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
- * 2. T1   : set PKRU to deny access to pkey=4, touches page
- * 3. T1   : faults...
- * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
- * 5. T1   : enters fault handler, takes mmap_sem, etc...
- * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
- *           faulted on a pte with its pkey=4.
- */
-static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
-                u32 *pkey)
-{
-        /* This is effectively an #ifdef */
-        if (!boot_cpu_has(X86_FEATURE_OSPKE))
-                return;
-
-        /* Fault not from Protection Keys: nothing to do */
-        if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
-                return;
-        /*
-         * force_sig_info_fault() is called from a number of
-         * contexts, some of which have a VMA and some of which
-         * do not.  The X86_PF_PK handing happens after we have a
-         * valid VMA, so we should never reach this without a
-         * valid VMA.
-         */
-        if (!pkey) {
-                WARN_ONCE(1, "PKU fault with no VMA passed in");
-                info->si_pkey = 0;
-                return;
-        }
-        /*
-         * si_pkey should be thought of as a strong hint, but not
-         * absolutely guranteed to be 100% accurate because of
-         * the race explained above.
-         */
-        info->si_pkey = *pkey;
-}
-
-static void
-force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-                     struct task_struct *tsk, u32 *pkey, int fault)
-{
-        unsigned lsb = 0;
-        siginfo_t info;
-
-        clear_siginfo(&info);
-        info.si_signo   = si_signo;
-        info.si_errno   = 0;
-        info.si_code    = si_code;
-        info.si_addr    = (void __user *)address;
-        if (fault & VM_FAULT_HWPOISON_LARGE)
-                lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
-        if (fault & VM_FAULT_HWPOISON)
-                lsb = PAGE_SHIFT;
-        info.si_addr_lsb = lsb;
-
-        fill_sig_info_pkey(si_signo, si_code, &info, pkey);
-
-        force_sig_info(si_signo, &info, tsk);
 }

 DEFINE_SPINLOCK(pgd_lock);
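Two details of the is_prefetch() rework above are worth a note. The early bail-out for user-mode IP values is gone; instead the opcode walk runs between pagefault_disable() and pagefault_enable() and uses the mode-appropriate non-faulting accessor, so an IP that points at a not-present page makes the copy fail cleanly rather than recurse into the fault handler. A minimal sketch of the same pattern using only the generic <linux/uaccess.h> helpers (the peek_opcode() wrapper itself is hypothetical, not part of the patch):

    /* Read one opcode byte from a kernel address that may be unmapped. */
    static bool peek_opcode(const unsigned char *ip, unsigned char *opcode)
    {
            bool ok;

            pagefault_disable();                    /* a fault now fails the copy   */
            ok = !get_kernel_nofault(*opcode, ip);  /* 0 on success, -EFAULT if not */
            pagefault_enable();

            return ok;
    }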
---|
@@ -273 +198 @@
        return pmd_k;
 }

-static void vmalloc_sync(void)
-{
-        unsigned long address;
-
-        if (SHARED_KERNEL_PMD)
-                return;
-
-        for (address = VMALLOC_START & PMD_MASK;
-             address >= TASK_SIZE_MAX && address < VMALLOC_END;
-             address += PMD_SIZE) {
-                struct page *page;
-
-                spin_lock(&pgd_lock);
-                list_for_each_entry(page, &pgd_list, lru) {
-                        spinlock_t *pgt_lock;
-
-                        /* the pgt_lock only for Xen */
-                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
-
-                        spin_lock(pgt_lock);
-                        vmalloc_sync_one(page_address(page), address);
-                        spin_unlock(pgt_lock);
-                }
-                spin_unlock(&pgd_lock);
-        }
-}
-
-void vmalloc_sync_mappings(void)
-{
-        vmalloc_sync();
-}
-
-void vmalloc_sync_unmappings(void)
-{
-        vmalloc_sync();
-}
-
 /*
- * 32-bit:
- *
  * Handle a fault on the vmalloc or module mapping area
+ *
+ * This is needed because there is a race condition between the time
+ * when the vmalloc mapping code updates the PMD to the point in time
+ * where it synchronizes this update with the other page-tables in the
+ * system.
+ *
+ * In this race window another thread/CPU can map an area on the same
+ * PMD, finds it already present and does not synchronize it with the
+ * rest of the system yet. As a result v[mz]alloc might return areas
+ * which are not mapped in every page-table in the system, causing an
+ * unhandled page-fault when they are accessed.
  */
 static noinline int vmalloc_fault(unsigned long address)
 {
@@ -347 +244 @@
        return 0;
 }
 NOKPROBE_SYMBOL(vmalloc_fault);
+
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+        unsigned long addr;
+
+        for (addr = start & PMD_MASK;
+             addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
+             addr += PMD_SIZE) {
+                struct page *page;
+
+                spin_lock(&pgd_lock);
+                list_for_each_entry(page, &pgd_list, lru) {
+                        spinlock_t *pgt_lock;
+
+                        /* the pgt_lock only for Xen */
+                        pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+
+                        spin_lock(pgt_lock);
+                        vmalloc_sync_one(page_address(page), addr);
+                        spin_unlock(pgt_lock);
+                }
+                spin_unlock(&pgd_lock);
+        }
+}
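Note how arch_sync_kernel_mappings() carries over the old vmalloc_sync() body almost verbatim: only the hard-coded VMALLOC_START starting point is replaced by the start/end range the caller passes in. The caller is now the generic vmalloc code, which invokes this hook whenever it modified one of the page-table levels the architecture names in ARCH_PAGE_TABLE_SYNC_MASK. Roughly, paraphrased rather than quoted from the generic mm code:

    /* x86-32 opts in when the kernel PMD is not shared (e.g. PAE): */
    #define ARCH_PAGE_TABLE_SYNC_MASK       PGTBL_PMD_MODIFIED

    /* mm/vmalloc.c, after populating page tables for [start, end): */
    if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
            arch_sync_kernel_mappings(start, end);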
---|

 /*
  * Did it hit the DOS screen memory VA from vm86 mode?
@@ -412 +333 @@

 #else /* CONFIG_X86_64: */

-void vmalloc_sync_mappings(void)
-{
-        /*
-         * 64-bit mappings might allocate new p4d/pud pages
-         * that need to be propagated to all tasks' PGDs.
-         */
-        sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
-}
-
-void vmalloc_sync_unmappings(void)
-{
-        /*
-         * Unmappings never allocate or free p4d/pud pages.
-         * No work is required here.
-         */
-}
-
-/*
- * 64-bit:
- *
- * Handle a fault on the vmalloc area
- */
-static noinline int vmalloc_fault(unsigned long address)
-{
-        pgd_t *pgd, *pgd_k;
-        p4d_t *p4d, *p4d_k;
-        pud_t *pud;
-        pmd_t *pmd;
-        pte_t *pte;
-
-        /* Make sure we are in vmalloc area: */
-        if (!(address >= VMALLOC_START && address < VMALLOC_END))
-                return -1;
-
-        /*
-         * Copy kernel mappings over when needed. This can also
-         * happen within a race in page table update. In the later
-         * case just flush:
-         */
-        pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
-        pgd_k = pgd_offset_k(address);
-        if (pgd_none(*pgd_k))
-                return -1;
-
-        if (pgtable_l5_enabled()) {
-                if (pgd_none(*pgd)) {
-                        set_pgd(pgd, *pgd_k);
-                        arch_flush_lazy_mmu_mode();
-                } else {
-                        BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
-                }
-        }
-
-        /* With 4-level paging, copying happens on the p4d level. */
-        p4d = p4d_offset(pgd, address);
-        p4d_k = p4d_offset(pgd_k, address);
-        if (p4d_none(*p4d_k))
-                return -1;
-
-        if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
-                set_p4d(p4d, *p4d_k);
-                arch_flush_lazy_mmu_mode();
-        } else {
-                BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
-        }
-
-        BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
-
-        pud = pud_offset(p4d, address);
-        if (pud_none(*pud))
-                return -1;
-
-        if (pud_large(*pud))
-                return 0;
-
-        pmd = pmd_offset(pud, address);
-        if (pmd_none(*pmd))
-                return -1;
-
-        if (pmd_large(*pmd))
-                return 0;
-
-        pte = pte_offset_kernel(pmd, address);
-        if (!pte_present(*pte))
-                return -1;
-
-        return 0;
-}
-NOKPROBE_SYMBOL(vmalloc_fault);
-
 #ifdef CONFIG_CPU_SUP_AMD
 static const char errata93_warning[] =
 KERN_ERR
@@ -524 +355 @@
 {
        unsigned long dummy;

-        return probe_kernel_address((unsigned long *)p, dummy);
+        return get_kernel_nofault(dummy, (unsigned long *)p);
 }

 static void dump_pagetable(unsigned long address)
@@ -637 +468 @@
        return 0;
 }

+/* Pentium F0 0F C7 C8 bug workaround: */
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
-        unsigned long nr;
-
-        /*
-         * Pentium F0 0F C7 C8 bug workaround:
-         */
-        if (boot_cpu_has_bug(X86_BUG_F00F)) {
-                nr = (address - idt_descr.address) >> 3;
-
-                if (nr == 6) {
-                        do_invalid_op(regs, 0);
-                        return 1;
-                }
+        if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
+                handle_invalid_op(regs);
+                return 1;
        }
 #endif
        return 0;
 }

+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
+{
+        u32 offset = (index >> 3) * sizeof(struct desc_struct);
+        unsigned long addr;
+        struct ldttss_desc desc;
+
+        if (index == 0) {
+                pr_alert("%s: NULL\n", name);
+                return;
+        }
+
+        if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
+                pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
+                return;
+        }
+
+        if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
+                                     sizeof(struct ldttss_desc))) {
+                pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
+                         name, index);
+                return;
+        }
+
+        addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
+#ifdef CONFIG_X86_64
+        addr |= ((u64)desc.base3 << 32);
+#endif
+        pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
+                 name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
+}
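The base-address stitching in show_ldttss() follows the hardware layout of an x86 system-segment descriptor, which scatters the base across three fields (four in long mode). For reference, struct ldttss_desc looks roughly like this (paraphrased from asm/desc_defs.h):

    struct ldttss_desc {
            u16     limit0;                 /* limit bits  0..15 */
            u16     base0;                  /* base  bits  0..15 */
            u16     base1 : 8,              /* base  bits 16..23 */
                    type  : 5, dpl : 2, p : 1;
            u16     limit1 : 4,             /* limit bits 16..19 */
                    zero0 : 3, g : 1,
                    base2 : 8;              /* base  bits 24..31 */
    #ifdef CONFIG_X86_64
            u32     base3;                  /* base  bits 32..63 */
            u32     zero1;
    #endif
    } __packed;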
---|
+
 static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
-                unsigned long address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
        if (!oops_may_print())
                return;
@@ -684 +537 @@
                 from_kuid(&init_user_ns, current_uid()));
        }

-        pr_alert("BUG: unable to handle kernel %s at %px\n",
-                 address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
-                 (void *)address);
+        if (address < PAGE_SIZE && !user_mode(regs))
+                pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
+                         (void *)address);
+        else
+                pr_alert("BUG: unable to handle page fault for address: %px\n",
+                         (void *)address);
+
+        pr_alert("#PF: %s %s in %s mode\n",
+                 (error_code & X86_PF_USER)  ? "user" : "supervisor",
+                 (error_code & X86_PF_INSTR) ? "instruction fetch" :
+                 (error_code & X86_PF_WRITE) ? "write access" :
+                                               "read access",
+                 user_mode(regs) ? "user" : "kernel");
+        pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
+                 !(error_code & X86_PF_PROT) ? "not-present page" :
+                 (error_code & X86_PF_RSVD)  ? "reserved bit violation" :
+                 (error_code & X86_PF_PK)    ? "protection keys violation" :
+                                               "permissions violation");
+
+        if (!(error_code & X86_PF_USER) && user_mode(regs)) {
+                struct desc_ptr idt, gdt;
+                u16 ldtr, tr;
+
+                /*
+                 * This can happen for quite a few reasons.  The more obvious
+                 * ones are faults accessing the GDT, or LDT.  Perhaps
+                 * surprisingly, if the CPU tries to deliver a benign or
+                 * contributory exception from user code and gets a page fault
+                 * during delivery, the page fault can be delivered as though
+                 * it originated directly from user code.  This could happen
+                 * due to wrong permissions on the IDT, GDT, LDT, TSS, or
+                 * kernel or IST stack.
+                 */
+                store_idt(&idt);
+
+                /* Usable even on Xen PV -- it's just slow. */
+                native_store_gdt(&gdt);
+
+                pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
+                         idt.address, idt.size, gdt.address, gdt.size);
+
+                store_ldt(ldtr);
+                show_ldttss(&gdt, "LDTR", ldtr);
+
+                store_tr(tr);
+                show_ldttss(&gdt, "TR", tr);
+        }

        dump_pagetable(address);
 }
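Pieced together from the new format strings, the reworked oops header reads something like this (the address is invented for illustration):

    BUG: kernel NULL pointer dereference, address: 0000000000000008
    #PF: supervisor write access in kernel mode
    #PF: error_code(0x0002) - not-present page

compared with the single old-style "BUG: unable to handle kernel NULL pointer dereference at ..." line it replaces.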
---|
@@ -707 +604 @@
                 tsk->comm, address);
        dump_pagetable(address);

-        tsk->thread.cr2         = address;
-        tsk->thread.trap_nr     = X86_TRAP_PF;
-        tsk->thread.error_code  = error_code;
-
        if (__die("Bad pagetable", regs, error_code))
                sig = 0;

        oops_end(flags, regs, sig);
+}
+
+static void set_signal_archinfo(unsigned long address,
+                                unsigned long error_code)
+{
+        struct task_struct *tsk = current;
+
+        /*
+         * To avoid leaking information about the kernel page
+         * table layout, pretend that user-mode accesses to
+         * kernel addresses are always protection faults.
+         *
+         * NB: This means that failed vsyscalls with vsyscall=none
+         * will have the PROT bit.  This doesn't leak any
+         * information and does not appear to cause any problems.
+         */
+        if (address >= TASK_SIZE_MAX)
+                error_code |= X86_PF_PROT;
+
+        tsk->thread.trap_nr = X86_TRAP_PF;
+        tsk->thread.error_code = error_code | X86_PF_USER;
+        tsk->thread.cr2 = address;
 }
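The sanitized value stored in tsk->thread.error_code is what a user-space signal handler ultimately observes; on x86-64, glibc exposes it through the REG_ERR slot of the machine context. A hypothetical handler illustrating the effect (not from this patch; REG_ERR needs _GNU_SOURCE):

    static void segv_handler(int sig, siginfo_t *si, void *ctx)
    {
            ucontext_t *uc = ctx;
            unsigned long err = uc->uc_mcontext.gregs[REG_ERR];

            /* For a fault on a kernel address, bit 0 (X86_PF_PROT) is
             * always set here, courtesy of set_signal_archinfo(). */
            (void)err;
    }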
---|

 static noinline void
@@ -725 +640 @@
        unsigned long flags;
        int sig;

+        if (user_mode(regs)) {
+                /*
+                 * This is an implicit supervisor-mode access from user
+                 * mode.  Bypass all the kernel-mode recovery code and just
+                 * OOPS.
+                 */
+                goto oops;
+        }
+
        /* Are we prepared to handle this kernel fault? */
-        if (fixup_exception(regs, X86_TRAP_PF)) {
+        if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
                /*
                 * Any interrupt that takes a fault gets the fixup. This makes
                 * the below recursive fault logic only apply to a faults from
@@ -742 +666 @@
         * faulting through the emulate_vsyscall() logic.
         */
        if (current->thread.sig_on_uaccess_err && signal) {
-                tsk->thread.trap_nr = X86_TRAP_PF;
-                tsk->thread.error_code = error_code | X86_PF_USER;
-                tsk->thread.cr2 = address;
+                set_signal_archinfo(address, error_code);

                /* XXX: hwpoison faults will set the wrong code. */
-                force_sig_info_fault(signal, si_code, address,
-                                     tsk, NULL, 0);
+                force_sig_fault(signal, si_code, (void __user *)address);
        }

        /*
@@ -766 +687 @@
        if (is_vmalloc_addr((void *)address) &&
            (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
             address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
-                unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
+                unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
                /*
                 * We're likely to be running with very little stack space
                 * left.  It's plausible that we'd hit this condition but
@@ -806 +727 @@
                return;

        /*
+         * Buggy firmware could access regions which might page fault, try to
+         * recover from such faults.
+         */
+        if (IS_ENABLED(CONFIG_EFI))
+                efi_recover_from_page_fault(address);
+
+        /* Only not-present faults should be handled by KFENCE. */
+        if (!(error_code & X86_PF_PROT) &&
+            kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+                return;
+
+oops:
+        /*
         * Oops. The kernel tried to access some bad page. We'll have to
         * terminate things with extreme prejudice:
         */
@@ -815 +749 @@

        if (task_stack_end_corrupted(tsk))
                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
-
-        tsk->thread.cr2         = address;
-        tsk->thread.trap_nr     = X86_TRAP_PF;
-        tsk->thread.error_code  = error_code;

        sig = SIGKILL;
        if (__die("Oops", regs, error_code))
@@ -857 +787 @@
        show_opcodes(regs, loglvl);
 }

+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+        return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
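VSYSCALL_ADDR is the fixed legacy address 0xffffffffff600000 (uapi/asm/vsyscall.h defines it as -10UL << 20), so the predicate matches exactly one page:

    is_vsyscall_vaddr(0xffffffffff600000UL);  /* true  - vgettimeofday entry */
    is_vsyscall_vaddr(0xffffffffff600400UL);  /* true  - same page (vtime)   */
    is_vsyscall_vaddr(0xffffffffff601000UL);  /* false - one page past it    */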
---|
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-                       unsigned long address, u32 *pkey, int si_code)
+                       unsigned long address, u32 pkey, int si_code)
 {
        struct task_struct *tsk = current;

        /* User mode accesses just cause a SIGSEGV */
-        if (error_code & X86_PF_USER) {
+        if (user_mode(regs) && (error_code & X86_PF_USER)) {
                /*
                 * It's possible to have interrupts off here:
                 */
@@ -880 +819 @@
        if (is_errata100(regs, address))
                return;

-#ifdef CONFIG_X86_64
-        /*
-         * Instruction fetch faults in the vsyscall page might need
-         * emulation.
-         */
-        if (unlikely((error_code & X86_PF_INSTR) &&
-                     ((address & ~0xfff) == VSYSCALL_ADDR))) {
-                if (emulate_vsyscall(regs, address))
-                        return;
-        }
-#endif
-
        /*
         * To avoid leaking information about the kernel page table
         * layout, pretend that user-mode accesses to kernel addresses
@@ -903 +830 @@
        if (likely(show_unhandled_signals))
                show_signal_msg(regs, error_code, address, tsk);

-        tsk->thread.cr2         = address;
-        tsk->thread.error_code  = error_code;
-        tsk->thread.trap_nr     = X86_TRAP_PF;
+        set_signal_archinfo(address, error_code);

-        force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
+        if (si_code == SEGV_PKUERR)
+                force_sig_pkuerr((void __user *)address, pkey);
+
+        force_sig_fault(SIGSEGV, si_code, (void __user *)address);
+
+        local_irq_disable();

        return;
 }
@@ -920 +850 @@

 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-                     unsigned long address, u32 *pkey)
+                     unsigned long address)
 {
-        __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
+        __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
 }

 static void
 __bad_area(struct pt_regs *regs, unsigned long error_code,
-           unsigned long address, struct vm_area_struct *vma, int si_code)
+           unsigned long address, u32 pkey, int si_code)
 {
        struct mm_struct *mm = current->mm;
-        u32 pkey;
-
-        if (vma)
-                pkey = vma_pkey(vma);
-
        /*
         * Something tried to access memory that isn't in our memory map..
         * Fix it, but check if it's kernel or user first..
         */
-        up_read(&mm->mmap_sem);
+        mmap_read_unlock(mm);

-        __bad_area_nosemaphore(regs, error_code, address,
-                               (vma) ? &pkey : NULL, si_code);
+        __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
 }

 static noinline void
 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-        __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+        __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
 }

 static inline bool bad_area_access_from_pkeys(unsigned long error_code,
@@ -977 +901 @@
         * But, doing it this way allows compiler optimizations
         * if pkeys are compiled out.
         */
-        if (bad_area_access_from_pkeys(error_code, vma))
-                __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
-        else
-                __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
+        if (bad_area_access_from_pkeys(error_code, vma)) {
+                /*
+                 * A protection key fault means that the PKRU value did not allow
+                 * access to some PTE.  Userspace can figure out what PKRU was
+                 * from the XSAVE state.  This function captures the pkey from
+                 * the vma and passes it to userspace so userspace can discover
+                 * which protection key was set on the PTE.
+                 *
+                 * If we get here, we know that the hardware signaled a X86_PF_PK
+                 * fault and that there was a VMA once we got in the fault
+                 * handler.  It does *not* guarantee that the VMA we find here
+                 * was the one that we faulted on.
+                 *
+                 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
+                 * 2. T1   : set PKRU to deny access to pkey=4, touches page
+                 * 3. T1   : faults...
+                 * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+                 * 5. T1   : enters fault handler, takes mmap_lock, etc...
+                 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
+                 *           faulted on a pte with its pkey=4.
+                 */
+                u32 pkey = vma_pkey(vma);
+
+                __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
+        } else {
+                __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
+        }
 }
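The whole path is observable from userspace with the pkey system calls; a minimal sketch using the glibc 2.27+ wrappers (error handling omitted, values illustrative):

    int pkey = pkey_alloc(0, 0);
    char *p  = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    pkey_mprotect(p, 4096, PROT_READ | PROT_WRITE, pkey);
    pkey_set(pkey, PKEY_DISABLE_ACCESS);

    *p = 1;  /* SIGSEGV with si_code == SEGV_PKUERR, si_pkey == pkey */

As the comment in the hunk stresses, si_pkey is a strong hint rather than a guarantee: a concurrent pkey_mprotect() in another thread can change vma_pkey() between the fault and the VMA walk here.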
---|

 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
-          u32 *pkey, unsigned int fault)
+          vm_fault_t fault)
 {
-        struct task_struct *tsk = current;
-        int code = BUS_ADRERR;
-
        /* Kernel mode? Handle exceptions or die: */
        if (!(error_code & X86_PF_USER)) {
                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
@@ -1000 +944 @@
        if (is_prefetch(regs, error_code, address))
                return;

-        tsk->thread.cr2         = address;
-        tsk->thread.error_code  = error_code;
-        tsk->thread.trap_nr     = X86_TRAP_PF;
+        set_signal_archinfo(address, error_code);

 #ifdef CONFIG_MEMORY_FAILURE
        if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
-                printk(KERN_ERR
+                struct task_struct *tsk = current;
+                unsigned lsb = 0;
+
+                pr_err(
 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
                        tsk->comm, tsk->pid, address);
-                code = BUS_MCEERR_AR;
+                if (fault & VM_FAULT_HWPOISON_LARGE)
+                        lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+                if (fault & VM_FAULT_HWPOISON)
+                        lsb = PAGE_SHIFT;
+                force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
+                return;
        }
 #endif
-        force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
+        force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
 }

 static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
-               unsigned long address, u32 *pkey, vm_fault_t fault)
+               unsigned long address, vm_fault_t fault)
 {
        if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
                no_context(regs, error_code, address, 0, 0);
@@ -1041 +991 @@
        } else {
                if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                             VM_FAULT_HWPOISON_LARGE))
-                        do_sigbus(regs, error_code, address, pkey, fault);
+                        do_sigbus(regs, error_code, address, fault);
                else if (fault & VM_FAULT_SIGSEGV)
-                        bad_area_nosemaphore(regs, error_code, address, pkey);
+                        bad_area_nosemaphore(regs, error_code, address);
                else
                        BUG();
        }
 }

-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
        if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
                return 0;

        if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
                return 0;
-        /*
-         * Note: We do not do lazy flushing on protection key
-         * changes, so no spurious fault will ever set X86_PF_PK.
-         */
-        if ((error_code & X86_PF_PK))
-                return 1;

        return 1;
 }
@@ -1088 +1032 @@
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
        pgd_t *pgd;
        p4d_t *p4d;
@@ -1119 +1063 @@
                return 0;

        if (p4d_large(*p4d))
-                return spurious_fault_check(error_code, (pte_t *) p4d);
+                return spurious_kernel_fault_check(error_code, (pte_t *) p4d);

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return 0;

        if (pud_large(*pud))
-                return spurious_fault_check(error_code, (pte_t *) pud);
+                return spurious_kernel_fault_check(error_code, (pte_t *) pud);

        pmd = pmd_offset(pud, address);
        if (!pmd_present(*pmd))
                return 0;

        if (pmd_large(*pmd))
-                return spurious_fault_check(error_code, (pte_t *) pmd);
+                return spurious_kernel_fault_check(error_code, (pte_t *) pmd);

        pte = pte_offset_kernel(pmd, address);
        if (!pte_present(*pte))
                return 0;

-        ret = spurious_fault_check(error_code, pte);
+        ret = spurious_kernel_fault_check(error_code, pte);
        if (!ret)
                return 0;

@@ -1147 +1091 @@
         * Make sure we have permissions in PMD.
         * If not, then there's a bug in the page tables:
         */
-        ret = spurious_fault_check(error_code, (pte_t *) pmd);
+        ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
        WARN_ONCE(!ret, "PMD has incorrect permission bits\n");

        return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
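Behind the rename sits the actual mechanism: x86 may keep a stale, more-restrictive permission cached in the TLB after a software upgrade of a kernel PTE (the "Optional Invalidation" language cited above), so a fault can arrive for an access the page tables already permit. A schematic of the case the walk recognizes (illustrative pseudo-C, not from the patch):

    /* CPU 0 upgrades a kernel mapping RO -> RW, software only: */
    set_pte(ptep, pte_mkwrite(*ptep));      /* deliberately no TLB flush */

    /* CPU 1 still caches the RO entry and writes through it:   */
    *kernel_ptr = val;                      /* -> #PF with X86_PF_WRITE  */

    /* spurious_kernel_fault() finds pte_write(*pte) set, declares the
     * fault spurious and resumes; the retried access then sees the
     * up-to-date entry. */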
---|

 int show_unhandled_signals = 1;

@@ -1191 +1135 @@
                return 1;

        /* read, not present: */
-        if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+        if (unlikely(!vma_is_accessible(vma)))
                return 1;

        return 0;
 }

-static int fault_in_kernel_space(unsigned long address)
+bool fault_in_kernel_space(unsigned long address)
 {
+        /*
+         * On 64-bit systems, the vsyscall page is at an address above
+         * TASK_SIZE_MAX, but is not considered part of the kernel
+         * address space.
+         */
+        if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+                return false;
+
        return address >= TASK_SIZE_MAX;
 }

-static inline bool smap_violation(int error_code, struct pt_regs *regs)
-{
-        if (!IS_ENABLED(CONFIG_X86_SMAP))
-                return false;
-
-        if (!static_cpu_has(X86_FEATURE_SMAP))
-                return false;
-
-        if (error_code & X86_PF_USER)
-                return false;
-
-        if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
-                return false;
-
-        return true;
-}
-
 /*
- * This routine handles page faults.  It determines the address,
- * and the problem, and then passes it off to one of the appropriate
- * routines.
+ * Called for all faults where 'address' is part of the kernel address
+ * space.  Might get called for faults that originate from *code* that
+ * ran in userspace or the kernel.
  */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
-                unsigned long address)
+static void
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
+                   unsigned long address)
 {
-        struct vm_area_struct *vma;
-        struct task_struct *tsk;
-        struct mm_struct *mm;
-        vm_fault_t fault, major = 0;
-        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-        u32 pkey;
-
-        tsk = current;
-        mm = tsk->mm;
-
-        prefetchw(&mm->mmap_sem);
-
-        if (unlikely(kmmio_fault(regs, address)))
-                return;
-
        /*
-         * We fault-in kernel-space virtual memory on-demand. The
+         * Protection keys exceptions only happen on user pages.  We
+         * have no user pages in the kernel portion of the address
+         * space, so do not expect them here.
+         */
+        WARN_ON_ONCE(hw_error_code & X86_PF_PK);
+
+#ifdef CONFIG_X86_32
+        /*
+         * We can fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
@@ -1252 +1180 @@
         * only copy the information from the master page table,
         * nothing more.
         *
-         * This verifies that the fault happens in kernel space
-         * (error_code & 4) == 0, and that the fault was not a
-         * protection error (error_code & 9) == 0.
+         * Before doing this on-demand faulting, ensure that the
+         * fault is not any of the following:
+         * 1. A fault on a PTE with a reserved bit set.
+         * 2. A fault caused by a user-mode access.  (Do not demand-
+         *    fault kernel memory due to user-mode accesses).
+         * 3. A fault caused by a page-level protection violation.
+         *    (A demand fault would be on a non-present page which
+         *     would have X86_PF_PROT==0).
+         *
+         * This is only needed to close a race condition on x86-32 in
+         * the vmalloc mapping/unmapping code. See the comment above
+         * vmalloc_fault() for details. On x86-64 the race does not
+         * exist as the vmalloc mappings don't need to be synchronized
+         * there.
         */
-        if (unlikely(fault_in_kernel_space(address))) {
-                if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-                        if (vmalloc_fault(address) >= 0)
-                                return;
-                }
-
-                /* Can handle a stale RO->RW TLB: */
-                if (spurious_fault(error_code, address))
+        if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+                if (vmalloc_fault(address) >= 0)
                        return;
-
-                /* kprobes don't want to hook the spurious faults: */
-                if (kprobes_fault(regs))
-                        return;
-                /*
-                 * Don't take the mm semaphore here. If we fixup a prefetch
-                 * fault we could otherwise deadlock:
-                 */
-                bad_area_nosemaphore(regs, error_code, address, NULL);
-
-                return;
        }
+#endif
+
+        /* Was the fault spurious, caused by lazy TLB invalidation? */
+        if (spurious_kernel_fault(hw_error_code, address))
+                return;

        /* kprobes don't want to hook the spurious faults: */
-        if (unlikely(kprobes_fault(regs)))
+        if (kprobe_page_fault(regs, X86_TRAP_PF))
                return;

-        if (unlikely(error_code & X86_PF_RSVD))
-                pgtable_bad(regs, error_code, address);
+        /*
+         * Note, despite being a "bad area", there are quite a few
+         * acceptable reasons to get here, such as erratum fixups
+         * and handling kernel code that can fault, like get_user().
+         *
+         * Don't take the mm semaphore here. If we fixup a prefetch
+         * fault we could otherwise deadlock:
+         */
+        bad_area_nosemaphore(regs, hw_error_code, address);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);

-        if (unlikely(smap_violation(error_code, regs))) {
-                bad_area_nosemaphore(regs, error_code, address, NULL);
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+                        unsigned long hw_error_code,
+                        unsigned long address)
+{
+        struct vm_area_struct *vma = NULL;
+        struct task_struct *tsk;
+        struct mm_struct *mm;
+        vm_fault_t fault;
+        unsigned int flags = FAULT_FLAG_DEFAULT;
+
+        tsk = current;
+        mm = tsk->mm;
+
+        /* kprobes don't want to hook the spurious faults: */
+        if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
+                return;
+
+        /*
+         * Reserved bits are never expected to be set on
+         * entries in the user portion of the page tables.
+         */
+        if (unlikely(hw_error_code & X86_PF_RSVD))
+                pgtable_bad(regs, hw_error_code, address);
+
+        /*
+         * If SMAP is on, check for invalid kernel (supervisor) access to user
+         * pages in the user address space.  The odd case here is WRUSS,
+         * which, according to the preliminary documentation, does not respect
+         * SMAP and will have the USER bit set so, in all cases, SMAP
+         * enforcement appears to be consistent with the USER bit.
+         */
+        if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
+                     !(hw_error_code & X86_PF_USER) &&
+                     !(regs->flags & X86_EFLAGS_AC)))
+        {
+                bad_area_nosemaphore(regs, hw_error_code, address);
                return;
        }

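The open-coded test replaces the old smap_violation() helper with largely the same logic. The rule it enforces: with SMAP enabled, a supervisor-mode access to a user address is legitimate only while EFLAGS.AC is set, which the uaccess helpers manage with the STAC/CLAC instructions. Schematically (real uaccess API, simplified fragment):

    if (!user_access_begin(uptr, sizeof(*uptr)))    /* STAC: sets EFLAGS.AC */
            return -EFAULT;
    unsafe_get_user(val, uptr, efault);             /* no SMAP #PF: AC == 1 */
    user_access_end();                              /* CLAC: clears AC      */

A supervisor fault on a user address with AC clear therefore indicates a user access that was never annotated, and is treated as a bug.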
---|
.. | .. |
---|
1295 | 1267 | * in a region with pagefaults disabled then we must not take the fault |
---|
1296 | 1268 | */ |
---|
1297 | 1269 | if (unlikely(faulthandler_disabled() || !mm)) { |
---|
1298 | | - bad_area_nosemaphore(regs, error_code, address, NULL); |
---|
| 1270 | + bad_area_nosemaphore(regs, hw_error_code, address); |
---|
1299 | 1271 | return; |
---|
1300 | 1272 | } |
---|
1301 | 1273 | |
---|
.. | .. |
---|
1308 | 1280 | */ |
---|
1309 | 1281 | if (user_mode(regs)) { |
---|
1310 | 1282 | local_irq_enable(); |
---|
1311 | | - error_code |= X86_PF_USER; |
---|
1312 | 1283 | flags |= FAULT_FLAG_USER; |
---|
1313 | 1284 | } else { |
---|
1314 | 1285 | if (regs->flags & X86_EFLAGS_IF) |
---|
.. | .. |
---|
1317 | 1288 | |
---|
1318 | 1289 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); |
---|
1319 | 1290 | |
---|
1320 | | - if (error_code & X86_PF_WRITE) |
---|
| 1291 | + if (hw_error_code & X86_PF_WRITE) |
---|
1321 | 1292 | flags |= FAULT_FLAG_WRITE; |
---|
1322 | | - if (error_code & X86_PF_INSTR) |
---|
| 1293 | + if (hw_error_code & X86_PF_INSTR) |
---|
1323 | 1294 | flags |= FAULT_FLAG_INSTRUCTION; |
---|
1324 | 1295 | |
---|
| 1296 | +#ifdef CONFIG_X86_64 |
---|
1325 | 1297 | /* |
---|
1326 | | - * When running in the kernel we expect faults to occur only to |
---|
1327 | | - * addresses in user space. All other faults represent errors in |
---|
1328 | | - * the kernel and should generate an OOPS. Unfortunately, in the |
---|
1329 | | - * case of an erroneous fault occurring in a code path which already |
---|
1330 | | - * holds mmap_sem we will deadlock attempting to validate the fault |
---|
1331 | | - * against the address space. Luckily the kernel only validly |
---|
1332 | | - * references user space from well defined areas of code, which are |
---|
1333 | | - * listed in the exceptions table. |
---|
| 1298 | + * Faults in the vsyscall page might need emulation. The |
---|
| 1299 | + * vsyscall page is at a high address (>PAGE_OFFSET), but is |
---|
| 1300 | + * considered to be part of the user address space. |
---|
1334 | 1301 | * |
---|
1335 | | - * As the vast majority of faults will be valid we will only perform |
---|
1336 | | - * the source reference check when there is a possibility of a |
---|
1337 | | - * deadlock. Attempt to lock the address space, if we cannot we then |
---|
1338 | | - * validate the source. If this is invalid we can skip the address |
---|
1339 | | - * space check, thus avoiding the deadlock: |
---|
| 1302 | + * The vsyscall page does not have a "real" VMA, so do this |
---|
| 1303 | + * emulation before we go searching for VMAs. |
---|
| 1304 | + * |
---|
| 1305 | + * PKRU never rejects instruction fetches, so we don't need |
---|
| 1306 | + * to consider the PF_PK bit. |
---|
1340 | 1307 | */ |
---|
1341 | | - if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
---|
1342 | | - if (!(error_code & X86_PF_USER) && |
---|
1343 | | - !search_exception_tables(regs->ip)) { |
---|
1344 | | - bad_area_nosemaphore(regs, error_code, address, NULL); |
---|
| 1308 | + if (is_vsyscall_vaddr(address)) { |
---|
| 1309 | + if (emulate_vsyscall(hw_error_code, regs, address)) |
---|
| 1310 | + return; |
---|
| 1311 | + } |
---|
| 1312 | +#endif |
---|
| 1313 | + |
---|
| 1314 | + /* |
---|
| 1315 | + * Do not try to do a speculative page fault if the fault was due to |
---|
| 1316 | + * protection keys since it can't be resolved. |
---|
| 1317 | + */ |
---|
| 1318 | + if (!(hw_error_code & X86_PF_PK)) { |
---|
| 1319 | + fault = handle_speculative_fault(mm, address, flags, &vma, regs); |
---|
| 1320 | + if (fault != VM_FAULT_RETRY) |
---|
| 1321 | + goto done; |
---|
| 1322 | + } |
---|
| 1323 | + |
---|
| 1324 | + /* |
---|
| 1325 | + * Kernel-mode access to the user address space should only occur |
---|
| 1326 | + * on well-defined single instructions listed in the exception |
---|
| 1327 | + * tables. But, an erroneous kernel fault occurring outside one of |
---|
| 1328 | + * those areas which also holds mmap_lock might deadlock attempting |
---|
| 1329 | + * to validate the fault against the address space. |
---|
| 1330 | + * |
---|
| 1331 | + * Only do the expensive exception table search when we might be at |
---|
| 1332 | + * risk of a deadlock. This happens if we |
---|
| 1333 | + * 1. Failed to acquire mmap_lock, and |
---|
| 1334 | + * 2. The access did not originate in userspace. |
---|
| 1335 | + */ |
---|
| 1336 | + if (unlikely(!mmap_read_trylock(mm))) { |
---|
| 1337 | + if (!user_mode(regs) && !search_exception_tables(regs->ip)) { |
---|
| 1338 | + /* |
---|
| 1339 | + * Fault from code in kernel from |
---|
| 1340 | + * which we do not expect faults. |
---|
| 1341 | + */ |
---|
| 1342 | + bad_area_nosemaphore(regs, hw_error_code, address); |
---|
1345 | 1343 | return; |
---|
1346 | 1344 | } |
---|
1347 | 1345 | retry: |
---|
1348 | | - down_read(&mm->mmap_sem); |
---|
| 1346 | + mmap_read_lock(mm); |
---|
1349 | 1347 | } else { |
---|
1350 | 1348 | /* |
---|
1351 | 1349 | * The above down_read_trylock() might have succeeded in |
---|
.. | .. |
---|
1355 | 1353 | might_sleep(); |
---|
1356 | 1354 | } |
---|
1357 | 1355 | |
---|
1358 | | - vma = find_vma(mm, address); |
---|
| 1356 | + if (!vma || !can_reuse_spf_vma(vma, address)) |
---|
| 1357 | + vma = find_vma(mm, address); |
---|
1359 | 1358 | if (unlikely(!vma)) { |
---|
1360 | | - bad_area(regs, error_code, address); |
---|
| 1359 | + bad_area(regs, hw_error_code, address); |
---|
1361 | 1360 | return; |
---|
1362 | 1361 | } |
---|
1363 | 1362 | if (likely(vma->vm_start <= address)) |
---|
1364 | 1363 | goto good_area; |
---|
1365 | 1364 | if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { |
---|
1366 | | - bad_area(regs, error_code, address); |
---|
| 1365 | + bad_area(regs, hw_error_code, address); |
---|
1367 | 1366 | return; |
---|
1368 | 1367 | } |
---|
1369 | | - if (error_code & X86_PF_USER) { |
---|
1370 | | - /* |
---|
1371 | | - * Accessing the stack below %sp is always a bug. |
---|
1372 | | - * The large cushion allows instructions like enter |
---|
1373 | | - * and pusha to work. ("enter $65535, $31" pushes |
---|
1374 | | - * 32 pointers and then decrements %sp by 65535.) |
---|
1375 | | - */ |
---|
1376 | | - if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { |
---|
1377 | | - bad_area(regs, error_code, address); |
---|
1378 | | - return; |
---|
1379 | | - } |
---|
1380 | | - } |
---|
1381 | 1368 | if (unlikely(expand_stack(vma, address))) { |
---|
1382 | | - bad_area(regs, error_code, address); |
---|
| 1369 | + bad_area(regs, hw_error_code, address); |
---|
1383 | 1370 | return; |
---|
1384 | 1371 | } |
---|
1385 | 1372 | |
---|
.. | .. |
---|
1388 | 1375 | * we can handle it.. |
---|
1389 | 1376 | */ |
---|
1390 | 1377 | good_area: |
---|
1391 | | - if (unlikely(access_error(error_code, vma))) { |
---|
1392 | | - bad_area_access_error(regs, error_code, address, vma); |
---|
| 1378 | + if (unlikely(access_error(hw_error_code, vma))) { |
---|
| 1379 | + bad_area_access_error(regs, hw_error_code, address, vma); |
---|
1393 | 1380 | return; |
---|
1394 | 1381 | } |
---|
1395 | 1382 | |
---|
.. | .. |
---|
1397 | 1384 | * If for any reason at all we couldn't handle the fault, |
---|
1398 | 1385 | * make sure we exit gracefully rather than endlessly redo |
---|
1399 | 1386 | * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if |
---|
1400 | | - * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. |
---|
| 1387 | + * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked. |
---|
1401 | 1388 | * |
---|
1402 | | - * Note that handle_userfault() may also release and reacquire mmap_sem |
---|
| 1389 | + * Note that handle_userfault() may also release and reacquire mmap_lock |
---|
1403 | 1390 | * (and not return with VM_FAULT_RETRY), when returning to userland to |
---|
1404 | 1391 | * repeat the page fault later with a VM_FAULT_NOPAGE retval |
---|
1405 | 1392 | * (potentially after handling any pending signal during the return to |
---|
1406 | 1393 | * userland). The return to userland is identified whenever |
---|
1407 | 1394 | * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. |
---|
1408 | | - * Thus we have to be careful about not touching vma after handling the |
---|
1409 | | - * fault, so we read the pkey beforehand. |
---|
1410 | 1395 | */ |
---|
1411 | | - pkey = vma_pkey(vma); |
---|
1412 | | - fault = handle_mm_fault(vma, address, flags); |
---|
1413 | | - major |= fault & VM_FAULT_MAJOR; |
---|
| 1396 | + fault = handle_mm_fault(vma, address, flags, regs); |
---|
| 1397 | + |
---|
| 1398 | + /* Quick path to respond to signals */ |
---|
| 1399 | + if (fault_signal_pending(fault, regs)) { |
---|
| 1400 | + if (!user_mode(regs)) |
---|
| 1401 | + no_context(regs, hw_error_code, address, SIGBUS, |
---|
| 1402 | + BUS_ADRERR); |
---|
| 1403 | + return; |
---|
| 1404 | + } |
---|
1414 | 1405 | |
---|
1415 | 1406 | /* |
---|
1416 | | - * If we need to retry the mmap_sem has already been released, |
---|
| 1407 | + * If we need to retry the mmap_lock has already been released, |
---|
1417 | 1408 | * and if there is a fatal signal pending there is no guarantee |
---|
1418 | 1409 | * that we made any progress. Handle this case first. |
---|
1419 | 1410 | */ |
---|
1420 | | - if (unlikely(fault & VM_FAULT_RETRY)) { |
---|
1421 | | - /* Retry at most once */ |
---|
1422 | | - if (flags & FAULT_FLAG_ALLOW_RETRY) { |
---|
1423 | | - flags &= ~FAULT_FLAG_ALLOW_RETRY; |
---|
1424 | | - flags |= FAULT_FLAG_TRIED; |
---|
1425 | | - if (!fatal_signal_pending(tsk)) |
---|
1426 | | - goto retry; |
---|
1427 | | - } |
---|
| 1411 | + if (unlikely((fault & VM_FAULT_RETRY) && |
---|
| 1412 | + (flags & FAULT_FLAG_ALLOW_RETRY))) { |
---|
| 1413 | + flags |= FAULT_FLAG_TRIED; |
---|
1428 | 1414 | |
---|
1429 | | - /* User mode? Just return to handle the fatal exception */ |
---|
1430 | | - if (flags & FAULT_FLAG_USER) |
---|
1431 | | - return; |
---|
| 1415 | + /* |
---|
| 1416 | + * Do not try to reuse this vma and fetch it |
---|
| 1417 | + * again since we will release the mmap_lock.
---|
| 1418 | + */ |
---|
| 1419 | + vma = NULL; |
---|
1432 | 1420 | |
---|
1433 | | - /* Not returning to user mode? Handle exceptions or die: */ |
---|
1434 | | - no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); |
---|
1435 | | - return; |
---|
| 1421 | + goto retry; |
---|
1436 | 1422 | } |
---|
1437 | 1423 | |
---|
1438 | | - up_read(&mm->mmap_sem); |
---|
| 1424 | + mmap_read_unlock(mm); |
---|
| 1425 | + |
---|
| 1426 | +done: |
---|
1439 | 1427 | if (unlikely(fault & VM_FAULT_ERROR)) { |
---|
1440 | | - mm_fault_error(regs, error_code, address, &pkey, fault); |
---|
| 1428 | + mm_fault_error(regs, hw_error_code, address, fault); |
---|
1441 | 1429 | return; |
---|
1442 | | - } |
---|
1443 | | - |
---|
1444 | | - /* |
---|
1445 | | - * Major/minor page fault accounting. If any of the events |
---|
1446 | | - * returned VM_FAULT_MAJOR, we account it as a major fault. |
---|
1447 | | - */ |
---|
1448 | | - if (major) { |
---|
1449 | | - tsk->maj_flt++; |
---|
1450 | | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); |
---|
1451 | | - } else { |
---|
1452 | | - tsk->min_flt++; |
---|
1453 | | - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); |
---|
1454 | 1430 | } |
---|
1455 | 1431 | |
---|
1456 | 1432 | check_v8086_mode(regs, address, tsk); |
---|
1457 | 1433 | } |
---|
1458 | | -NOKPROBE_SYMBOL(__do_page_fault); |
---|
| 1434 | +NOKPROBE_SYMBOL(do_user_addr_fault); |
---|
1459 | 1435 | |
---|
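Because the rewritten retry logic above is easy to misread, here is a schematic of the caller-side contract it implements: handle_mm_fault() has already dropped mmap_lock whenever it returns VM_FAULT_RETRY. This is a sketch under that assumption, not a drop-in replacement for do_user_addr_fault():

```c
static void fault_retry_sketch(struct mm_struct *mm, struct pt_regs *regs,
			       unsigned long address, unsigned int flags)
{
	struct vm_area_struct *vma;
	vm_fault_t fault;

retry:
	mmap_read_lock(mm);
	vma = find_vma(mm, address);
	if (!vma) {
		mmap_read_unlock(mm);
		return;
	}

	fault = handle_mm_fault(vma, address, flags, regs);

	if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
		flags |= FAULT_FLAG_TRIED;	/* note that we tried once */
		vma = NULL;			/* stale once the lock is gone */
		goto retry;			/* mmap_lock already dropped */
	}

	mmap_read_unlock(mm);
}
```

Clearing vma before the retry mirrors the new comment in the hunk: once mmap_lock has been dropped, the old vma pointer may refer to freed or reused memory.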
1460 | | -static nokprobe_inline void |
---|
1461 | | -trace_page_fault_entries(unsigned long address, struct pt_regs *regs, |
---|
1462 | | - unsigned long error_code) |
---|
| 1436 | +static __always_inline void |
---|
| 1437 | +trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code, |
---|
| 1438 | + unsigned long address) |
---|
1463 | 1439 | { |
---|
| 1440 | + if (!trace_pagefault_enabled()) |
---|
| 1441 | + return; |
---|
| 1442 | + |
---|
1464 | 1443 | if (user_mode(regs)) |
---|
1465 | 1444 | trace_page_fault_user(address, regs, error_code); |
---|
1466 | 1445 | else |
---|
1467 | 1446 | trace_page_fault_kernel(address, regs, error_code); |
---|
1468 | 1447 | } |
---|
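The early return through trace_pagefault_enabled() is what keeps the tracepoints free when tracing is off. A sketch of the underlying mechanism, assuming the usual static-key plumbing (the key name below is hypothetical; the real helper comes from the asm/trace headers included at the top of the file):

```c
#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(pagefault_trace_key);	/* hypothetical key */

static __always_inline bool pagefault_tracing_on(void)
{
	/* compiles to a NOP that gets live-patched when tracing is enabled */
	return static_branch_unlikely(&pagefault_trace_key);
}
```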
1469 | 1448 | |
---|
1470 | | -/* |
---|
1471 | | - * We must have this function blacklisted from kprobes, tagged with notrace |
---|
1472 | | - * and call read_cr2() before calling anything else. To avoid calling any |
---|
1473 | | - * kind of tracing machinery before we've observed the CR2 value. |
---|
1474 | | - * |
---|
1475 | | - * exception_{enter,exit}() contains all sorts of tracepoints. |
---|
1476 | | - */ |
---|
1477 | | -dotraplinkage void notrace |
---|
1478 | | -do_page_fault(struct pt_regs *regs, unsigned long error_code) |
---|
| 1449 | +static __always_inline void |
---|
| 1450 | +handle_page_fault(struct pt_regs *regs, unsigned long error_code, |
---|
| 1451 | + unsigned long address) |
---|
1479 | 1452 | { |
---|
1480 | | - unsigned long address = read_cr2(); /* Get the faulting address */ |
---|
1481 | | - enum ctx_state prev_state; |
---|
| 1453 | + trace_page_fault_entries(regs, error_code, address); |
---|
1482 | 1454 | |
---|
1483 | | - prev_state = exception_enter(); |
---|
1484 | | - if (trace_pagefault_enabled()) |
---|
1485 | | - trace_page_fault_entries(address, regs, error_code); |
---|
| 1455 | + if (unlikely(kmmio_fault(regs, address))) |
---|
| 1456 | + return; |
---|
1486 | 1457 | |
---|
1487 | | - __do_page_fault(regs, error_code, address); |
---|
1488 | | - exception_exit(prev_state); |
---|
| 1458 | + /* Was the fault on kernel-controlled part of the address space? */ |
---|
| 1459 | + if (unlikely(fault_in_kernel_space(address))) { |
---|
| 1460 | + do_kern_addr_fault(regs, error_code, address); |
---|
| 1461 | + } else { |
---|
| 1462 | + do_user_addr_fault(regs, error_code, address); |
---|
| 1463 | + /* |
---|
| 1464 | + * User address page fault handling might have reenabled |
---|
| 1465 | + * interrupts. Fixing up all potential exit points of |
---|
| 1466 | + * do_user_addr_fault() and its leaf functions is just not |
---|
| 1467 | + * doable w/o creating an unholy mess or turning the code |
---|
| 1468 | + * upside down. |
---|
| 1469 | + */ |
---|
| 1470 | + local_irq_disable(); |
---|
| 1471 | + } |
---|
1489 | 1472 | } |
---|
1490 | | -NOKPROBE_SYMBOL(do_page_fault); |
---|
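Note that handle_page_fault() routes on the faulting *address*, not on the mode the CPU was in when it faulted. A minimal sketch of what such a predicate looks like, assuming mainline's behavior of carving the vsyscall page out of "kernel space" on 64-bit (that page is user-visible despite its high address):

```c
static bool fault_in_kernel_space_sketch(unsigned long address)
{
	/* vsyscall page: user-accessible despite living above TASK_SIZE_MAX */
	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
		return false;

	return address >= TASK_SIZE_MAX;
}
```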
| 1473 | + |
---|
| 1474 | +DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) |
---|
| 1475 | +{ |
---|
| 1476 | + unsigned long address = read_cr2(); |
---|
| 1477 | + irqentry_state_t state; |
---|
| 1478 | + |
---|
| 1479 | + prefetchw(&current->mm->mmap_lock);
---|
| 1480 | + |
---|
| 1481 | + /* |
---|
| 1482 | + * KVM uses #PF vector to deliver 'page not present' events to guests |
---|
| 1483 | + * (asynchronous page fault mechanism). The event happens when a |
---|
| 1484 | + * userspace task is trying to access some valid (from guest's point of |
---|
| 1485 | + * view) memory which is not currently mapped by the host (e.g. the |
---|
| 1486 | + * memory is swapped out). Note, the corresponding "page ready" event |
---|
| 1487 | + * which is injected when the memory becomes available, is delivered via
---|
| 1488 | + * an interrupt mechanism and not a #PF exception |
---|
| 1489 | + * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()). |
---|
| 1490 | + * |
---|
| 1491 | + * We are relying on the interrupted context being sane (valid RSP, |
---|
| 1492 | + * relevant locks not held, etc.), which is fine as long as the |
---|
| 1493 | + * interrupted context had IF=1. We are also relying on the KVM |
---|
| 1494 | + * async pf type field and CR2 being read consistently instead of |
---|
| 1495 | + * getting values from real and async page faults mixed up. |
---|
| 1496 | + * |
---|
| 1497 | + * Fingers crossed. |
---|
| 1498 | + * |
---|
| 1499 | + * The async #PF handling code takes care of idtentry handling |
---|
| 1500 | + * itself. |
---|
| 1501 | + */ |
---|
| 1502 | + if (kvm_handle_async_pf(regs, (u32)address)) |
---|
| 1503 | + return; |
---|
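The (u32)address cast above is deliberate: for an async PF event, CR2 carries a 32-bit token identifying the stalled page, not a fault address. A heavily hedged sketch of the guest-side disambiguation (helper names are approximate; the real implementation is __kvm_handle_async_pf() in arch/x86/kernel/kvm.c):

```c
static bool async_pf_sketch(u32 token)
{
	/* flags word shared with the host via the PV "apf" data area */
	u32 flags = kvm_read_and_reset_apf_flags();

	if (!flags)
		return false;	/* ordinary #PF: fall through to handle_page_fault() */

	/* "page not present": park this task until the host says "page ready" */
	kvm_async_pf_task_wait_schedule(token);
	return true;
}
```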
| 1504 | + |
---|
| 1505 | + /* |
---|
| 1506 | + * Entry handling for valid #PF from kernel mode is slightly |
---|
| 1507 | + * different: RCU is already watching and rcu_irq_enter() must not |
---|
| 1508 | + * be invoked because a kernel fault on a user space address might |
---|
| 1509 | + * sleep. |
---|
| 1510 | + * |
---|
| 1511 | + * In case the fault hit an RCU idle region, the conditional entry
---|
| 1512 | + * code re-enabled RCU to avoid subsequent wreckage, which helps
---|
| 1513 | + * debuggability.
---|
| 1514 | + */ |
---|
| 1515 | + state = irqentry_enter(regs); |
---|
| 1516 | + |
---|
| 1517 | + instrumentation_begin(); |
---|
| 1518 | + handle_page_fault(regs, error_code, address); |
---|
| 1519 | + instrumentation_end(); |
---|
| 1520 | + |
---|
| 1521 | + irqentry_exit(regs, state); |
---|
| 1522 | +} |
---|
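For readers unfamiliar with the idtentry rework: DEFINE_IDTENTRY_RAW_ERRORCODE replaces the old dotraplinkage prototype. Roughly — this is a simplified sketch, not the literal macro output from arch/x86/include/asm/idtentry.h — it emits a noinstr C handler that the assembly entry stub calls with the saved registers and the pushed hardware error code:

```c
/* approximate expansion of DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault) */
__visible noinstr void exc_page_fault(struct pt_regs *regs,
				      unsigned long error_code)
{
	/* function body as written in the hunk above */
}
```

The RAW flavor means the body is responsible for its own irqentry_enter()/irqentry_exit() and instrumentation_begin()/instrumentation_end() bracketing, which is exactly what the new exc_page_fault() does after reading CR2 first.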