forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-05-10 9999e48639b3cecb08ffb37358bcba3b48161b29
kernel/arch/x86/mm/fault.c
....@@ -8,7 +8,8 @@
88 #include <linux/sched/task_stack.h> /* task_stack_*(), ... */
99 #include <linux/kdebug.h> /* oops_begin/end, ... */
1010 #include <linux/extable.h> /* search_exception_tables */
11
-#include <linux/bootmem.h> /* max_low_pfn */
11
+#include <linux/memblock.h> /* max_low_pfn */
12
+#include <linux/kfence.h> /* kfence_handle_page_fault */
1213 #include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
1314 #include <linux/mmiotrace.h> /* kmmio_handler, ... */
1415 #include <linux/perf_event.h> /* perf_sw_event */
....@@ -16,15 +17,20 @@
1617 #include <linux/prefetch.h> /* prefetchw */
1718 #include <linux/context_tracking.h> /* exception_enter(), ... */
1819 #include <linux/uaccess.h> /* faulthandler_disabled() */
20
+#include <linux/efi.h> /* efi_recover_from_page_fault()*/
1921 #include <linux/mm_types.h>
2022
2123 #include <asm/cpufeature.h> /* boot_cpu_has, ... */
2224 #include <asm/traps.h> /* dotraplinkage, ... */
23
-#include <asm/pgalloc.h> /* pgd_*(), ... */
2425 #include <asm/fixmap.h> /* VSYSCALL_ADDR */
2526 #include <asm/vsyscall.h> /* emulate_vsyscall */
2627 #include <asm/vm86.h> /* struct vm86 */
2728 #include <asm/mmu_context.h> /* vma_pkey() */
29
+#include <asm/efi.h> /* efi_recover_from_page_fault()*/
30
+#include <asm/desc.h> /* store_idt(), ... */
31
+#include <asm/cpu_entry_area.h> /* exception stack */
32
+#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
33
+#include <asm/kvm_para.h> /* kvm_handle_async_pf */
2834
2935 #define CREATE_TRACE_POINTS
3036 #include <asm/trace/exceptions.h>
....@@ -42,28 +48,13 @@
4248 return 0;
4349 }
4450
45
-static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
46
-{
47
- int ret = 0;
48
-
49
- /* kprobe_running() needs smp_processor_id() */
50
- if (kprobes_built_in() && !user_mode(regs)) {
51
- preempt_disable();
52
- if (kprobe_running() && kprobe_fault_handler(regs, 14))
53
- ret = 1;
54
- preempt_enable();
55
- }
56
-
57
- return ret;
58
-}
59
-
6051 /*
6152 * Prefetch quirks:
6253 *
6354 * 32-bit mode:
6455 *
6556 * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
66
- * Check that here and ignore it.
57
+ * Check that here and ignore it. This is AMD erratum #91.
6758 *
6859 * 64-bit mode:
6960 *
....@@ -92,11 +83,7 @@
9283 #ifdef CONFIG_X86_64
9384 case 0x40:
9485 /*
95
- * In AMD64 long mode 0x40..0x4F are valid REX prefixes
96
- * Need to figure out under what instruction mode the
97
- * instruction was issued. Could check the LDT for lm,
98
- * but for now it's good enough to assume that long
99
- * mode only uses well known segments or kernel.
86
+ * In 64-bit mode 0x40..0x4F are valid REX prefixes
10087 */
10188 return (!user_mode(regs) || user_64bit_mode(regs));
10289 #endif
....@@ -108,7 +95,7 @@
10895 return !instr_lo || (instr_lo>>1) == 1;
10996 case 0x00:
11097 /* Prefetch instruction is 0x0F0D or 0x0F18 */
111
- if (probe_kernel_address(instr, opcode))
98
+ if (get_kernel_nofault(opcode, instr))
11299 return 0;
113100
114101 *prefetch = (instr_lo == 0xF) &&
....@@ -136,94 +123,32 @@
136123 instr = (void *)convert_ip_to_linear(current, regs);
137124 max_instr = instr + 15;
138125
139
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
140
- return 0;
126
+ /*
127
+ * This code has historically always bailed out if IP points to a
128
+ * not-present page (e.g. due to a race). No one has ever
129
+ * complained about this.
130
+ */
131
+ pagefault_disable();
141132
142133 while (instr < max_instr) {
143134 unsigned char opcode;
144135
145
- if (probe_kernel_address(instr, opcode))
146
- break;
136
+ if (user_mode(regs)) {
137
+ if (get_user(opcode, instr))
138
+ break;
139
+ } else {
140
+ if (get_kernel_nofault(opcode, instr))
141
+ break;
142
+ }
147143
148144 instr++;
149145
150146 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
151147 break;
152148 }
149
+
150
+ pagefault_enable();
153151 return prefetch;
154
-}
155
-
156
-/*
157
- * A protection key fault means that the PKRU value did not allow
158
- * access to some PTE. Userspace can figure out what PKRU was
159
- * from the XSAVE state, and this function fills out a field in
160
- * siginfo so userspace can discover which protection key was set
161
- * on the PTE.
162
- *
163
- * If we get here, we know that the hardware signaled a X86_PF_PK
164
- * fault and that there was a VMA once we got in the fault
165
- * handler. It does *not* guarantee that the VMA we find here
166
- * was the one that we faulted on.
167
- *
168
- * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
169
- * 2. T1 : set PKRU to deny access to pkey=4, touches page
170
- * 3. T1 : faults...
171
- * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
172
- * 5. T1 : enters fault handler, takes mmap_sem, etc...
173
- * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
174
- * faulted on a pte with its pkey=4.
175
- */
176
-static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
177
- u32 *pkey)
178
-{
179
- /* This is effectively an #ifdef */
180
- if (!boot_cpu_has(X86_FEATURE_OSPKE))
181
- return;
182
-
183
- /* Fault not from Protection Keys: nothing to do */
184
- if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
185
- return;
186
- /*
187
- * force_sig_info_fault() is called from a number of
188
- * contexts, some of which have a VMA and some of which
189
- * do not. The X86_PF_PK handing happens after we have a
190
- * valid VMA, so we should never reach this without a
191
- * valid VMA.
192
- */
193
- if (!pkey) {
194
- WARN_ONCE(1, "PKU fault with no VMA passed in");
195
- info->si_pkey = 0;
196
- return;
197
- }
198
- /*
199
- * si_pkey should be thought of as a strong hint, but not
200
- * absolutely guranteed to be 100% accurate because of
201
- * the race explained above.
202
- */
203
- info->si_pkey = *pkey;
204
-}
205
-
206
-static void
207
-force_sig_info_fault(int si_signo, int si_code, unsigned long address,
208
- struct task_struct *tsk, u32 *pkey, int fault)
209
-{
210
- unsigned lsb = 0;
211
- siginfo_t info;
212
-
213
- clear_siginfo(&info);
214
- info.si_signo = si_signo;
215
- info.si_errno = 0;
216
- info.si_code = si_code;
217
- info.si_addr = (void __user *)address;
218
- if (fault & VM_FAULT_HWPOISON_LARGE)
219
- lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
220
- if (fault & VM_FAULT_HWPOISON)
221
- lsb = PAGE_SHIFT;
222
- info.si_addr_lsb = lsb;
223
-
224
- fill_sig_info_pkey(si_signo, si_code, &info, pkey);
225
-
226
- force_sig_info(si_signo, &info, tsk);
227152 }
228153
229154 DEFINE_SPINLOCK(pgd_lock);
....@@ -273,47 +198,19 @@
273198 return pmd_k;
274199 }
275200
276
-static void vmalloc_sync(void)
277
-{
278
- unsigned long address;
279
-
280
- if (SHARED_KERNEL_PMD)
281
- return;
282
-
283
- for (address = VMALLOC_START & PMD_MASK;
284
- address >= TASK_SIZE_MAX && address < VMALLOC_END;
285
- address += PMD_SIZE) {
286
- struct page *page;
287
-
288
- spin_lock(&pgd_lock);
289
- list_for_each_entry(page, &pgd_list, lru) {
290
- spinlock_t *pgt_lock;
291
-
292
- /* the pgt_lock only for Xen */
293
- pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
294
-
295
- spin_lock(pgt_lock);
296
- vmalloc_sync_one(page_address(page), address);
297
- spin_unlock(pgt_lock);
298
- }
299
- spin_unlock(&pgd_lock);
300
- }
301
-}
302
-
303
-void vmalloc_sync_mappings(void)
304
-{
305
- vmalloc_sync();
306
-}
307
-
308
-void vmalloc_sync_unmappings(void)
309
-{
310
- vmalloc_sync();
311
-}
312
-
313201 /*
314
- * 32-bit:
315
- *
316202 * Handle a fault on the vmalloc or module mapping area
203
+ *
204
+ * This is needed because there is a race condition between the time
205
+ * when the vmalloc mapping code updates the PMD to the point in time
206
+ * where it synchronizes this update with the other page-tables in the
207
+ * system.
208
+ *
209
+ * In this race window another thread/CPU can map an area on the same
210
+ * PMD, finds it already present and does not synchronize it with the
211
+ * rest of the system yet. As a result v[mz]alloc might return areas
212
+ * which are not mapped in every page-table in the system, causing an
213
+ * unhandled page-fault when they are accessed.
317214 */
318215 static noinline int vmalloc_fault(unsigned long address)
319216 {
....@@ -347,6 +244,30 @@
347244 return 0;
348245 }
349246 NOKPROBE_SYMBOL(vmalloc_fault);
247
+
248
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
249
+{
250
+ unsigned long addr;
251
+
252
+ for (addr = start & PMD_MASK;
253
+ addr >= TASK_SIZE_MAX && addr < VMALLOC_END;
254
+ addr += PMD_SIZE) {
255
+ struct page *page;
256
+
257
+ spin_lock(&pgd_lock);
258
+ list_for_each_entry(page, &pgd_list, lru) {
259
+ spinlock_t *pgt_lock;
260
+
261
+ /* the pgt_lock only for Xen */
262
+ pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
263
+
264
+ spin_lock(pgt_lock);
265
+ vmalloc_sync_one(page_address(page), addr);
266
+ spin_unlock(pgt_lock);
267
+ }
268
+ spin_unlock(&pgd_lock);
269
+ }
270
+}
350271
351272 /*
352273 * Did it hit the DOS screen memory VA from vm86 mode?
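For context on the hunk above: arch_sync_kernel_mappings() takes over from the removed vmalloc_sync_mappings()/vmalloc_sync_unmappings() pair and is driven from the generic mapping code rather than from the fault path. A minimal sketch of the caller side, assuming the upstream pgtbl_mod_mask mechanism; walk_and_populate() is a hypothetical stand-in for the real page-table population loop:

/* Editor sketch of the generic caller; not the verbatim mm/vmalloc.c code. */
static int sketch_map_kernel_range(unsigned long start, unsigned long end,
				   pgprot_t prot, struct page **pages)
{
	pgtbl_mod_mask mask = 0;	/* records which levels (PGD..PTE) were newly populated */
	int err;

	/* hypothetical stand-in for the real page-table population loop */
	err = walk_and_populate(start, end, prot, pages, &mask);

	/* only the levels listed in ARCH_PAGE_TABLE_SYNC_MASK need the sync */
	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}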
....@@ -412,96 +333,6 @@
412333
413334 #else /* CONFIG_X86_64: */
414335
415
-void vmalloc_sync_mappings(void)
416
-{
417
- /*
418
- * 64-bit mappings might allocate new p4d/pud pages
419
- * that need to be propagated to all tasks' PGDs.
420
- */
421
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
422
-}
423
-
424
-void vmalloc_sync_unmappings(void)
425
-{
426
- /*
427
- * Unmappings never allocate or free p4d/pud pages.
428
- * No work is required here.
429
- */
430
-}
431
-
432
-/*
433
- * 64-bit:
434
- *
435
- * Handle a fault on the vmalloc area
436
- */
437
-static noinline int vmalloc_fault(unsigned long address)
438
-{
439
- pgd_t *pgd, *pgd_k;
440
- p4d_t *p4d, *p4d_k;
441
- pud_t *pud;
442
- pmd_t *pmd;
443
- pte_t *pte;
444
-
445
- /* Make sure we are in vmalloc area: */
446
- if (!(address >= VMALLOC_START && address < VMALLOC_END))
447
- return -1;
448
-
449
- /*
450
- * Copy kernel mappings over when needed. This can also
451
- * happen within a race in page table update. In the later
452
- * case just flush:
453
- */
454
- pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
455
- pgd_k = pgd_offset_k(address);
456
- if (pgd_none(*pgd_k))
457
- return -1;
458
-
459
- if (pgtable_l5_enabled()) {
460
- if (pgd_none(*pgd)) {
461
- set_pgd(pgd, *pgd_k);
462
- arch_flush_lazy_mmu_mode();
463
- } else {
464
- BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k));
465
- }
466
- }
467
-
468
- /* With 4-level paging, copying happens on the p4d level. */
469
- p4d = p4d_offset(pgd, address);
470
- p4d_k = p4d_offset(pgd_k, address);
471
- if (p4d_none(*p4d_k))
472
- return -1;
473
-
474
- if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
475
- set_p4d(p4d, *p4d_k);
476
- arch_flush_lazy_mmu_mode();
477
- } else {
478
- BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k));
479
- }
480
-
481
- BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
482
-
483
- pud = pud_offset(p4d, address);
484
- if (pud_none(*pud))
485
- return -1;
486
-
487
- if (pud_large(*pud))
488
- return 0;
489
-
490
- pmd = pmd_offset(pud, address);
491
- if (pmd_none(*pmd))
492
- return -1;
493
-
494
- if (pmd_large(*pmd))
495
- return 0;
496
-
497
- pte = pte_offset_kernel(pmd, address);
498
- if (!pte_present(*pte))
499
- return -1;
500
-
501
- return 0;
502
-}
503
-NOKPROBE_SYMBOL(vmalloc_fault);
504
-
505336 #ifdef CONFIG_CPU_SUP_AMD
506337 static const char errata93_warning[] =
507338 KERN_ERR
....@@ -524,7 +355,7 @@
524355 {
525356 unsigned long dummy;
526357
527
- return probe_kernel_address((unsigned long *)p, dummy);
358
+ return get_kernel_nofault(dummy, (unsigned long *)p);
528359 }
529360
530361 static void dump_pagetable(unsigned long address)
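A side note on the probe_kernel_address() -> get_kernel_nofault() conversions in this hunk and in is_prefetch() further up: besides the rename, the argument order is swapped, with the destination now first. A small editor illustration (hypothetical helper, not from this file):

/* Editor sketch: both variants return 0 when the kernel address was readable. */
static bool sketch_kernel_word_readable(const unsigned long *addr)
{
	unsigned long val;

	/* old API:  probe_kernel_address(addr, val)  -- (source pointer, destination) */
	/* new API:  get_kernel_nofault(val, addr)    -- (destination, source pointer) */
	return get_kernel_nofault(val, addr) == 0;
}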
....@@ -637,29 +468,51 @@
637468 return 0;
638469 }
639470
471
+/* Pentium F0 0F C7 C8 bug workaround: */
640472 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
641473 {
642474 #ifdef CONFIG_X86_F00F_BUG
643
- unsigned long nr;
644
-
645
- /*
646
- * Pentium F0 0F C7 C8 bug workaround:
647
- */
648
- if (boot_cpu_has_bug(X86_BUG_F00F)) {
649
- nr = (address - idt_descr.address) >> 3;
650
-
651
- if (nr == 6) {
652
- do_invalid_op(regs, 0);
653
- return 1;
654
- }
475
+ if (boot_cpu_has_bug(X86_BUG_F00F) && idt_is_f00f_address(address)) {
476
+ handle_invalid_op(regs);
477
+ return 1;
655478 }
656479 #endif
657480 return 0;
658481 }
659482
483
+static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index)
484
+{
485
+ u32 offset = (index >> 3) * sizeof(struct desc_struct);
486
+ unsigned long addr;
487
+ struct ldttss_desc desc;
488
+
489
+ if (index == 0) {
490
+ pr_alert("%s: NULL\n", name);
491
+ return;
492
+ }
493
+
494
+ if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
495
+ pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
496
+ return;
497
+ }
498
+
499
+ if (copy_from_kernel_nofault(&desc, (void *)(gdt->address + offset),
500
+ sizeof(struct ldttss_desc))) {
501
+ pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
502
+ name, index);
503
+ return;
504
+ }
505
+
506
+ addr = desc.base0 | (desc.base1 << 16) | ((unsigned long)desc.base2 << 24);
507
+#ifdef CONFIG_X86_64
508
+ addr |= ((u64)desc.base3 << 32);
509
+#endif
510
+ pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
511
+ name, index, addr, (desc.limit0 | (desc.limit1 << 16)));
512
+}
513
+
660514 static void
661
-show_fault_oops(struct pt_regs *regs, unsigned long error_code,
662
- unsigned long address)
515
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
663516 {
664517 if (!oops_may_print())
665518 return;
....@@ -684,9 +537,53 @@
684537 from_kuid(&init_user_ns, current_uid()));
685538 }
686539
687
- pr_alert("BUG: unable to handle kernel %s at %px\n",
688
- address < PAGE_SIZE ? "NULL pointer dereference" : "paging request",
689
- (void *)address);
540
+ if (address < PAGE_SIZE && !user_mode(regs))
541
+ pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
542
+ (void *)address);
543
+ else
544
+ pr_alert("BUG: unable to handle page fault for address: %px\n",
545
+ (void *)address);
546
+
547
+ pr_alert("#PF: %s %s in %s mode\n",
548
+ (error_code & X86_PF_USER) ? "user" : "supervisor",
549
+ (error_code & X86_PF_INSTR) ? "instruction fetch" :
550
+ (error_code & X86_PF_WRITE) ? "write access" :
551
+ "read access",
552
+ user_mode(regs) ? "user" : "kernel");
553
+ pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
554
+ !(error_code & X86_PF_PROT) ? "not-present page" :
555
+ (error_code & X86_PF_RSVD) ? "reserved bit violation" :
556
+ (error_code & X86_PF_PK) ? "protection keys violation" :
557
+ "permissions violation");
558
+
559
+ if (!(error_code & X86_PF_USER) && user_mode(regs)) {
560
+ struct desc_ptr idt, gdt;
561
+ u16 ldtr, tr;
562
+
563
+ /*
564
+ * This can happen for quite a few reasons. The more obvious
565
+ * ones are faults accessing the GDT, or LDT. Perhaps
566
+ * surprisingly, if the CPU tries to deliver a benign or
567
+ * contributory exception from user code and gets a page fault
568
+ * during delivery, the page fault can be delivered as though
569
+ * it originated directly from user code. This could happen
570
+ * due to wrong permissions on the IDT, GDT, LDT, TSS, or
571
+ * kernel or IST stack.
572
+ */
573
+ store_idt(&idt);
574
+
575
+ /* Usable even on Xen PV -- it's just slow. */
576
+ native_store_gdt(&gdt);
577
+
578
+ pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
579
+ idt.address, idt.size, gdt.address, gdt.size);
580
+
581
+ store_ldt(ldtr);
582
+ show_ldttss(&gdt, "LDTR", ldtr);
583
+
584
+ store_tr(tr);
585
+ show_ldttss(&gdt, "TR", tr);
586
+ }
690587
691588 dump_pagetable(address);
692589 }
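Taken together, the reworked show_fault_oops() above replaces the single old "BUG: unable to handle kernel ... at ..." line with a small block of messages. As a rough illustration (assuming a supervisor write to an unmapped NULL pointer, i.e. error_code 0x0002), the output would look like:

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page

with the address rendered by %px (the raw pointer value) and the #PF lines decoded from the error-code bits exactly as in the pr_alert() calls shown in the hunk.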
....@@ -707,14 +604,32 @@
707604 tsk->comm, address);
708605 dump_pagetable(address);
709606
710
- tsk->thread.cr2 = address;
711
- tsk->thread.trap_nr = X86_TRAP_PF;
712
- tsk->thread.error_code = error_code;
713
-
714607 if (__die("Bad pagetable", regs, error_code))
715608 sig = 0;
716609
717610 oops_end(flags, regs, sig);
611
+}
612
+
613
+static void set_signal_archinfo(unsigned long address,
614
+ unsigned long error_code)
615
+{
616
+ struct task_struct *tsk = current;
617
+
618
+ /*
619
+ * To avoid leaking information about the kernel page
620
+ * table layout, pretend that user-mode accesses to
621
+ * kernel addresses are always protection faults.
622
+ *
623
+ * NB: This means that failed vsyscalls with vsyscall=none
624
+ * will have the PROT bit. This doesn't leak any
625
+ * information and does not appear to cause any problems.
626
+ */
627
+ if (address >= TASK_SIZE_MAX)
628
+ error_code |= X86_PF_PROT;
629
+
630
+ tsk->thread.trap_nr = X86_TRAP_PF;
631
+ tsk->thread.error_code = error_code | X86_PF_USER;
632
+ tsk->thread.cr2 = address;
718633 }
719634
720635 static noinline void
....@@ -725,8 +640,17 @@
725640 unsigned long flags;
726641 int sig;
727642
643
+ if (user_mode(regs)) {
644
+ /*
645
+ * This is an implicit supervisor-mode access from user
646
+ * mode. Bypass all the kernel-mode recovery code and just
647
+ * OOPS.
648
+ */
649
+ goto oops;
650
+ }
651
+
728652 /* Are we prepared to handle this kernel fault? */
729
- if (fixup_exception(regs, X86_TRAP_PF)) {
653
+ if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
730654 /*
731655 * Any interrupt that takes a fault gets the fixup. This makes
732656 * the below recursive fault logic only apply to a faults from
....@@ -742,13 +666,10 @@
742666 * faulting through the emulate_vsyscall() logic.
743667 */
744668 if (current->thread.sig_on_uaccess_err && signal) {
745
- tsk->thread.trap_nr = X86_TRAP_PF;
746
- tsk->thread.error_code = error_code | X86_PF_USER;
747
- tsk->thread.cr2 = address;
669
+ set_signal_archinfo(address, error_code);
748670
749671 /* XXX: hwpoison faults will set the wrong code. */
750
- force_sig_info_fault(signal, si_code, address,
751
- tsk, NULL, 0);
672
+ force_sig_fault(signal, si_code, (void __user *)address);
752673 }
753674
754675 /*
....@@ -766,7 +687,7 @@
766687 if (is_vmalloc_addr((void *)address) &&
767688 (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
768689 address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
769
- unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
690
+ unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
770691 /*
771692 * We're likely to be running with very little stack space
772693 * left. It's plausible that we'd hit this condition but
....@@ -806,6 +727,19 @@
806727 return;
807728
808729 /*
730
+ * Buggy firmware could access regions which might page fault, try to
731
+ * recover from such faults.
732
+ */
733
+ if (IS_ENABLED(CONFIG_EFI))
734
+ efi_recover_from_page_fault(address);
735
+
736
+ /* Only not-present faults should be handled by KFENCE. */
737
+ if (!(error_code & X86_PF_PROT) &&
738
+ kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
739
+ return;
740
+
741
+oops:
742
+ /*
809743 * Oops. The kernel tried to access some bad page. We'll have to
810744 * terminate things with extreme prejudice:
811745 */
....@@ -815,10 +749,6 @@
815749
816750 if (task_stack_end_corrupted(tsk))
817751 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
818
-
819
- tsk->thread.cr2 = address;
820
- tsk->thread.trap_nr = X86_TRAP_PF;
821
- tsk->thread.error_code = error_code;
822752
823753 sig = SIGKILL;
824754 if (__die("Oops", regs, error_code))
....@@ -857,14 +787,23 @@
857787 show_opcodes(regs, loglvl);
858788 }
859789
790
+/*
791
+ * The (legacy) vsyscall page is the lone page in the kernel portion
792
+ * of the address space that has user-accessible permissions.
793
+ */
794
+static bool is_vsyscall_vaddr(unsigned long vaddr)
795
+{
796
+ return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
797
+}
798
+
860799 static void
861800 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
862
- unsigned long address, u32 *pkey, int si_code)
801
+ unsigned long address, u32 pkey, int si_code)
863802 {
864803 struct task_struct *tsk = current;
865804
866805 /* User mode accesses just cause a SIGSEGV */
867
- if (error_code & X86_PF_USER) {
806
+ if (user_mode(regs) && (error_code & X86_PF_USER)) {
868807 /*
869808 * It's possible to have interrupts off here:
870809 */
....@@ -880,18 +819,6 @@
880819 if (is_errata100(regs, address))
881820 return;
882821
883
-#ifdef CONFIG_X86_64
884
- /*
885
- * Instruction fetch faults in the vsyscall page might need
886
- * emulation.
887
- */
888
- if (unlikely((error_code & X86_PF_INSTR) &&
889
- ((address & ~0xfff) == VSYSCALL_ADDR))) {
890
- if (emulate_vsyscall(regs, address))
891
- return;
892
- }
893
-#endif
894
-
895822 /*
896823 * To avoid leaking information about the kernel page table
897824 * layout, pretend that user-mode accesses to kernel addresses
....@@ -903,11 +830,14 @@
903830 if (likely(show_unhandled_signals))
904831 show_signal_msg(regs, error_code, address, tsk);
905832
906
- tsk->thread.cr2 = address;
907
- tsk->thread.error_code = error_code;
908
- tsk->thread.trap_nr = X86_TRAP_PF;
833
+ set_signal_archinfo(address, error_code);
909834
910
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
835
+ if (si_code == SEGV_PKUERR)
836
+ force_sig_pkuerr((void __user *)address, pkey);
837
+
838
+ force_sig_fault(SIGSEGV, si_code, (void __user *)address);
839
+
840
+ local_irq_disable();
911841
912842 return;
913843 }
....@@ -920,35 +850,29 @@
920850
921851 static noinline void
922852 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
923
- unsigned long address, u32 *pkey)
853
+ unsigned long address)
924854 {
925
- __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
855
+ __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
926856 }
927857
928858 static void
929859 __bad_area(struct pt_regs *regs, unsigned long error_code,
930
- unsigned long address, struct vm_area_struct *vma, int si_code)
860
+ unsigned long address, u32 pkey, int si_code)
931861 {
932862 struct mm_struct *mm = current->mm;
933
- u32 pkey;
934
-
935
- if (vma)
936
- pkey = vma_pkey(vma);
937
-
938863 /*
939864 * Something tried to access memory that isn't in our memory map..
940865 * Fix it, but check if it's kernel or user first..
941866 */
942
- up_read(&mm->mmap_sem);
867
+ mmap_read_unlock(mm);
943868
944
- __bad_area_nosemaphore(regs, error_code, address,
945
- (vma) ? &pkey : NULL, si_code);
869
+ __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
946870 }
947871
948872 static noinline void
949873 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
950874 {
951
- __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
875
+ __bad_area(regs, error_code, address, 0, SEGV_MAPERR);
952876 }
953877
954878 static inline bool bad_area_access_from_pkeys(unsigned long error_code,
....@@ -977,19 +901,39 @@
977901 * But, doing it this way allows compiler optimizations
978902 * if pkeys are compiled out.
979903 */
980
- if (bad_area_access_from_pkeys(error_code, vma))
981
- __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
982
- else
983
- __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
904
+ if (bad_area_access_from_pkeys(error_code, vma)) {
905
+ /*
906
+ * A protection key fault means that the PKRU value did not allow
907
+ * access to some PTE. Userspace can figure out what PKRU was
908
+ * from the XSAVE state. This function captures the pkey from
909
+ * the vma and passes it to userspace so userspace can discover
910
+ * which protection key was set on the PTE.
911
+ *
912
+ * If we get here, we know that the hardware signaled a X86_PF_PK
913
+ * fault and that there was a VMA once we got in the fault
914
+ * handler. It does *not* guarantee that the VMA we find here
915
+ * was the one that we faulted on.
916
+ *
917
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
918
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
919
+ * 3. T1 : faults...
920
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
921
+ * 5. T1 : enters fault handler, takes mmap_lock, etc...
922
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
923
+ * faulted on a pte with its pkey=4.
924
+ */
925
+ u32 pkey = vma_pkey(vma);
926
+
927
+ __bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
928
+ } else {
929
+ __bad_area(regs, error_code, address, 0, SEGV_ACCERR);
930
+ }
984931 }
985932
986933 static void
987934 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
988
- u32 *pkey, unsigned int fault)
935
+ vm_fault_t fault)
989936 {
990
- struct task_struct *tsk = current;
991
- int code = BUS_ADRERR;
992
-
993937 /* Kernel mode? Handle exceptions or die: */
994938 if (!(error_code & X86_PF_USER)) {
995939 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
....@@ -1000,24 +944,30 @@
1000944 if (is_prefetch(regs, error_code, address))
1001945 return;
1002946
1003
- tsk->thread.cr2 = address;
1004
- tsk->thread.error_code = error_code;
1005
- tsk->thread.trap_nr = X86_TRAP_PF;
947
+ set_signal_archinfo(address, error_code);
1006948
1007949 #ifdef CONFIG_MEMORY_FAILURE
1008950 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
1009
- printk(KERN_ERR
951
+ struct task_struct *tsk = current;
952
+ unsigned lsb = 0;
953
+
954
+ pr_err(
1010955 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
1011956 tsk->comm, tsk->pid, address);
1012
- code = BUS_MCEERR_AR;
957
+ if (fault & VM_FAULT_HWPOISON_LARGE)
958
+ lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
959
+ if (fault & VM_FAULT_HWPOISON)
960
+ lsb = PAGE_SHIFT;
961
+ force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
962
+ return;
1013963 }
1014964 #endif
1015
- force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
965
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
1016966 }
1017967
1018968 static noinline void
1019969 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
1020
- unsigned long address, u32 *pkey, vm_fault_t fault)
970
+ unsigned long address, vm_fault_t fault)
1021971 {
1022972 if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
1023973 no_context(regs, error_code, address, 0, 0);
....@@ -1041,27 +991,21 @@
1041991 } else {
1042992 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
1043993 VM_FAULT_HWPOISON_LARGE))
1044
- do_sigbus(regs, error_code, address, pkey, fault);
994
+ do_sigbus(regs, error_code, address, fault);
1045995 else if (fault & VM_FAULT_SIGSEGV)
1046
- bad_area_nosemaphore(regs, error_code, address, pkey);
996
+ bad_area_nosemaphore(regs, error_code, address);
1047997 else
1048998 BUG();
1049999 }
10501000 }
10511001
1052
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
1002
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
10531003 {
10541004 if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
10551005 return 0;
10561006
10571007 if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
10581008 return 0;
1059
- /*
1060
- * Note: We do not do lazy flushing on protection key
1061
- * changes, so no spurious fault will ever set X86_PF_PK.
1062
- */
1063
- if ((error_code & X86_PF_PK))
1064
- return 1;
10651009
10661010 return 1;
10671011 }
....@@ -1088,7 +1032,7 @@
10881032 * (Optional Invalidation).
10891033 */
10901034 static noinline int
1091
-spurious_fault(unsigned long error_code, unsigned long address)
1035
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
10921036 {
10931037 pgd_t *pgd;
10941038 p4d_t *p4d;
....@@ -1119,27 +1063,27 @@
11191063 return 0;
11201064
11211065 if (p4d_large(*p4d))
1122
- return spurious_fault_check(error_code, (pte_t *) p4d);
1066
+ return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
11231067
11241068 pud = pud_offset(p4d, address);
11251069 if (!pud_present(*pud))
11261070 return 0;
11271071
11281072 if (pud_large(*pud))
1129
- return spurious_fault_check(error_code, (pte_t *) pud);
1073
+ return spurious_kernel_fault_check(error_code, (pte_t *) pud);
11301074
11311075 pmd = pmd_offset(pud, address);
11321076 if (!pmd_present(*pmd))
11331077 return 0;
11341078
11351079 if (pmd_large(*pmd))
1136
- return spurious_fault_check(error_code, (pte_t *) pmd);
1080
+ return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
11371081
11381082 pte = pte_offset_kernel(pmd, address);
11391083 if (!pte_present(*pte))
11401084 return 0;
11411085
1142
- ret = spurious_fault_check(error_code, pte);
1086
+ ret = spurious_kernel_fault_check(error_code, pte);
11431087 if (!ret)
11441088 return 0;
11451089
....@@ -1147,12 +1091,12 @@
11471091 * Make sure we have permissions in PMD.
11481092 * If not, then there's a bug in the page tables:
11491093 */
1150
- ret = spurious_fault_check(error_code, (pte_t *) pmd);
1094
+ ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
11511095 WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
11521096
11531097 return ret;
11541098 }
1155
-NOKPROBE_SYMBOL(spurious_fault);
1099
+NOKPROBE_SYMBOL(spurious_kernel_fault);
11561100
11571101 int show_unhandled_signals = 1;
11581102
....@@ -1191,60 +1135,44 @@
11911135 return 1;
11921136
11931137 /* read, not present: */
1194
- if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1138
+ if (unlikely(!vma_is_accessible(vma)))
11951139 return 1;
11961140
11971141 return 0;
11981142 }
11991143
1200
-static int fault_in_kernel_space(unsigned long address)
1144
+bool fault_in_kernel_space(unsigned long address)
12011145 {
1146
+ /*
1147
+ * On 64-bit systems, the vsyscall page is at an address above
1148
+ * TASK_SIZE_MAX, but is not considered part of the kernel
1149
+ * address space.
1150
+ */
1151
+ if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
1152
+ return false;
1153
+
12021154 return address >= TASK_SIZE_MAX;
12031155 }
12041156
1205
-static inline bool smap_violation(int error_code, struct pt_regs *regs)
1206
-{
1207
- if (!IS_ENABLED(CONFIG_X86_SMAP))
1208
- return false;
1209
-
1210
- if (!static_cpu_has(X86_FEATURE_SMAP))
1211
- return false;
1212
-
1213
- if (error_code & X86_PF_USER)
1214
- return false;
1215
-
1216
- if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1217
- return false;
1218
-
1219
- return true;
1220
-}
1221
-
12221157 /*
1223
- * This routine handles page faults. It determines the address,
1224
- * and the problem, and then passes it off to one of the appropriate
1225
- * routines.
1158
+ * Called for all faults where 'address' is part of the kernel address
1159
+ * space. Might get called for faults that originate from *code* that
1160
+ * ran in userspace or the kernel.
12261161 */
1227
-static noinline void
1228
-__do_page_fault(struct pt_regs *regs, unsigned long error_code,
1229
- unsigned long address)
1162
+static void
1163
+do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1164
+ unsigned long address)
12301165 {
1231
- struct vm_area_struct *vma;
1232
- struct task_struct *tsk;
1233
- struct mm_struct *mm;
1234
- vm_fault_t fault, major = 0;
1235
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
1236
- u32 pkey;
1237
-
1238
- tsk = current;
1239
- mm = tsk->mm;
1240
-
1241
- prefetchw(&mm->mmap_sem);
1242
-
1243
- if (unlikely(kmmio_fault(regs, address)))
1244
- return;
1245
-
12461166 /*
1247
- * We fault-in kernel-space virtual memory on-demand. The
1167
+ * Protection keys exceptions only happen on user pages. We
1168
+ * have no user pages in the kernel portion of the address
1169
+ * space, so do not expect them here.
1170
+ */
1171
+ WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1172
+
1173
+#ifdef CONFIG_X86_32
1174
+ /*
1175
+ * We can fault-in kernel-space virtual memory on-demand. The
12481176 * 'reference' page table is init_mm.pgd.
12491177 *
12501178 * NOTE! We MUST NOT take any locks for this case. We may
....@@ -1252,41 +1180,85 @@
12521180 * only copy the information from the master page table,
12531181 * nothing more.
12541182 *
1255
- * This verifies that the fault happens in kernel space
1256
- * (error_code & 4) == 0, and that the fault was not a
1257
- * protection error (error_code & 9) == 0.
1183
+ * Before doing this on-demand faulting, ensure that the
1184
+ * fault is not any of the following:
1185
+ * 1. A fault on a PTE with a reserved bit set.
1186
+ * 2. A fault caused by a user-mode access. (Do not demand-
1187
+ * fault kernel memory due to user-mode accesses).
1188
+ * 3. A fault caused by a page-level protection violation.
1189
+ * (A demand fault would be on a non-present page which
1190
+ * would have X86_PF_PROT==0).
1191
+ *
1192
+ * This is only needed to close a race condition on x86-32 in
1193
+ * the vmalloc mapping/unmapping code. See the comment above
1194
+ * vmalloc_fault() for details. On x86-64 the race does not
1195
+ * exist as the vmalloc mappings don't need to be synchronized
1196
+ * there.
12581197 */
1259
- if (unlikely(fault_in_kernel_space(address))) {
1260
- if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1261
- if (vmalloc_fault(address) >= 0)
1262
- return;
1263
- }
1264
-
1265
- /* Can handle a stale RO->RW TLB: */
1266
- if (spurious_fault(error_code, address))
1198
+ if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
1199
+ if (vmalloc_fault(address) >= 0)
12671200 return;
1268
-
1269
- /* kprobes don't want to hook the spurious faults: */
1270
- if (kprobes_fault(regs))
1271
- return;
1272
- /*
1273
- * Don't take the mm semaphore here. If we fixup a prefetch
1274
- * fault we could otherwise deadlock:
1275
- */
1276
- bad_area_nosemaphore(regs, error_code, address, NULL);
1277
-
1278
- return;
12791201 }
1202
+#endif
1203
+
1204
+ /* Was the fault spurious, caused by lazy TLB invalidation? */
1205
+ if (spurious_kernel_fault(hw_error_code, address))
1206
+ return;
12801207
12811208 /* kprobes don't want to hook the spurious faults: */
1282
- if (unlikely(kprobes_fault(regs)))
1209
+ if (kprobe_page_fault(regs, X86_TRAP_PF))
12831210 return;
12841211
1285
- if (unlikely(error_code & X86_PF_RSVD))
1286
- pgtable_bad(regs, error_code, address);
1212
+ /*
1213
+ * Note, despite being a "bad area", there are quite a few
1214
+ * acceptable reasons to get here, such as erratum fixups
1215
+ * and handling kernel code that can fault, like get_user().
1216
+ *
1217
+ * Don't take the mm semaphore here. If we fixup a prefetch
1218
+ * fault we could otherwise deadlock:
1219
+ */
1220
+ bad_area_nosemaphore(regs, hw_error_code, address);
1221
+}
1222
+NOKPROBE_SYMBOL(do_kern_addr_fault);
12871223
1288
- if (unlikely(smap_violation(error_code, regs))) {
1289
- bad_area_nosemaphore(regs, error_code, address, NULL);
1224
+/* Handle faults in the user portion of the address space */
1225
+static inline
1226
+void do_user_addr_fault(struct pt_regs *regs,
1227
+ unsigned long hw_error_code,
1228
+ unsigned long address)
1229
+{
1230
+ struct vm_area_struct *vma = NULL;
1231
+ struct task_struct *tsk;
1232
+ struct mm_struct *mm;
1233
+ vm_fault_t fault;
1234
+ unsigned int flags = FAULT_FLAG_DEFAULT;
1235
+
1236
+ tsk = current;
1237
+ mm = tsk->mm;
1238
+
1239
+ /* kprobes don't want to hook the spurious faults: */
1240
+ if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
1241
+ return;
1242
+
1243
+ /*
1244
+ * Reserved bits are never expected to be set on
1245
+ * entries in the user portion of the page tables.
1246
+ */
1247
+ if (unlikely(hw_error_code & X86_PF_RSVD))
1248
+ pgtable_bad(regs, hw_error_code, address);
1249
+
1250
+ /*
1251
+ * If SMAP is on, check for invalid kernel (supervisor) access to user
1252
+ * pages in the user address space. The odd case here is WRUSS,
1253
+ * which, according to the preliminary documentation, does not respect
1254
+ * SMAP and will have the USER bit set so, in all cases, SMAP
1255
+ * enforcement appears to be consistent with the USER bit.
1256
+ */
1257
+ if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
1258
+ !(hw_error_code & X86_PF_USER) &&
1259
+ !(regs->flags & X86_EFLAGS_AC)))
1260
+ {
1261
+ bad_area_nosemaphore(regs, hw_error_code, address);
12901262 return;
12911263 }
12921264
....@@ -1295,7 +1267,7 @@
12951267 * in a region with pagefaults disabled then we must not take the fault
12961268 */
12971269 if (unlikely(faulthandler_disabled() || !mm)) {
1298
- bad_area_nosemaphore(regs, error_code, address, NULL);
1270
+ bad_area_nosemaphore(regs, hw_error_code, address);
12991271 return;
13001272 }
13011273
....@@ -1308,7 +1280,6 @@
13081280 */
13091281 if (user_mode(regs)) {
13101282 local_irq_enable();
1311
- error_code |= X86_PF_USER;
13121283 flags |= FAULT_FLAG_USER;
13131284 } else {
13141285 if (regs->flags & X86_EFLAGS_IF)
....@@ -1317,35 +1288,62 @@
13171288
13181289 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
13191290
1320
- if (error_code & X86_PF_WRITE)
1291
+ if (hw_error_code & X86_PF_WRITE)
13211292 flags |= FAULT_FLAG_WRITE;
1322
- if (error_code & X86_PF_INSTR)
1293
+ if (hw_error_code & X86_PF_INSTR)
13231294 flags |= FAULT_FLAG_INSTRUCTION;
13241295
1296
+#ifdef CONFIG_X86_64
13251297 /*
1326
- * When running in the kernel we expect faults to occur only to
1327
- * addresses in user space. All other faults represent errors in
1328
- * the kernel and should generate an OOPS. Unfortunately, in the
1329
- * case of an erroneous fault occurring in a code path which already
1330
- * holds mmap_sem we will deadlock attempting to validate the fault
1331
- * against the address space. Luckily the kernel only validly
1332
- * references user space from well defined areas of code, which are
1333
- * listed in the exceptions table.
1298
+ * Faults in the vsyscall page might need emulation. The
1299
+ * vsyscall page is at a high address (>PAGE_OFFSET), but is
1300
+ * considered to be part of the user address space.
13341301 *
1335
- * As the vast majority of faults will be valid we will only perform
1336
- * the source reference check when there is a possibility of a
1337
- * deadlock. Attempt to lock the address space, if we cannot we then
1338
- * validate the source. If this is invalid we can skip the address
1339
- * space check, thus avoiding the deadlock:
1302
+ * The vsyscall page does not have a "real" VMA, so do this
1303
+ * emulation before we go searching for VMAs.
1304
+ *
1305
+ * PKRU never rejects instruction fetches, so we don't need
1306
+ * to consider the PF_PK bit.
13401307 */
1341
- if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1342
- if (!(error_code & X86_PF_USER) &&
1343
- !search_exception_tables(regs->ip)) {
1344
- bad_area_nosemaphore(regs, error_code, address, NULL);
1308
+ if (is_vsyscall_vaddr(address)) {
1309
+ if (emulate_vsyscall(hw_error_code, regs, address))
1310
+ return;
1311
+ }
1312
+#endif
1313
+
1314
+ /*
1315
+ * Do not try to do a speculative page fault if the fault was due to
1316
+ * protection keys since it can't be resolved.
1317
+ */
1318
+ if (!(hw_error_code & X86_PF_PK)) {
1319
+ fault = handle_speculative_fault(mm, address, flags, &vma, regs);
1320
+ if (fault != VM_FAULT_RETRY)
1321
+ goto done;
1322
+ }
1323
+
1324
+ /*
1325
+ * Kernel-mode access to the user address space should only occur
1326
+ * on well-defined single instructions listed in the exception
1327
+ * tables. But, an erroneous kernel fault occurring outside one of
1328
+ * those areas which also holds mmap_lock might deadlock attempting
1329
+ * to validate the fault against the address space.
1330
+ *
1331
+ * Only do the expensive exception table search when we might be at
1332
+ * risk of a deadlock. This happens if we
1333
+ * 1. Failed to acquire mmap_lock, and
1334
+ * 2. The access did not originate in userspace.
1335
+ */
1336
+ if (unlikely(!mmap_read_trylock(mm))) {
1337
+ if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
1338
+ /*
1339
+ * Fault from code in kernel from
1340
+ * which we do not expect faults.
1341
+ */
1342
+ bad_area_nosemaphore(regs, hw_error_code, address);
13451343 return;
13461344 }
13471345 retry:
1348
- down_read(&mm->mmap_sem);
1346
+ mmap_read_lock(mm);
13491347 } else {
13501348 /*
13511349 * The above down_read_trylock() might have succeeded in
....@@ -1355,31 +1353,20 @@
13551353 might_sleep();
13561354 }
13571355
1358
- vma = find_vma(mm, address);
1356
+ if (!vma || !can_reuse_spf_vma(vma, address))
1357
+ vma = find_vma(mm, address);
13591358 if (unlikely(!vma)) {
1360
- bad_area(regs, error_code, address);
1359
+ bad_area(regs, hw_error_code, address);
13611360 return;
13621361 }
13631362 if (likely(vma->vm_start <= address))
13641363 goto good_area;
13651364 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1366
- bad_area(regs, error_code, address);
1365
+ bad_area(regs, hw_error_code, address);
13671366 return;
13681367 }
1369
- if (error_code & X86_PF_USER) {
1370
- /*
1371
- * Accessing the stack below %sp is always a bug.
1372
- * The large cushion allows instructions like enter
1373
- * and pusha to work. ("enter $65535, $31" pushes
1374
- * 32 pointers and then decrements %sp by 65535.)
1375
- */
1376
- if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1377
- bad_area(regs, error_code, address);
1378
- return;
1379
- }
1380
- }
13811368 if (unlikely(expand_stack(vma, address))) {
1382
- bad_area(regs, error_code, address);
1369
+ bad_area(regs, hw_error_code, address);
13831370 return;
13841371 }
13851372
....@@ -1388,8 +1375,8 @@
13881375 * we can handle it..
13891376 */
13901377 good_area:
1391
- if (unlikely(access_error(error_code, vma))) {
1392
- bad_area_access_error(regs, error_code, address, vma);
1378
+ if (unlikely(access_error(hw_error_code, vma))) {
1379
+ bad_area_access_error(regs, hw_error_code, address, vma);
13931380 return;
13941381 }
13951382
....@@ -1397,94 +1384,139 @@
13971384 * If for any reason at all we couldn't handle the fault,
13981385 * make sure we exit gracefully rather than endlessly redo
13991386 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
1400
- * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
1387
+ * we get VM_FAULT_RETRY back, the mmap_lock has been unlocked.
14011388 *
1402
- * Note that handle_userfault() may also release and reacquire mmap_sem
1389
+ * Note that handle_userfault() may also release and reacquire mmap_lock
14031390 * (and not return with VM_FAULT_RETRY), when returning to userland to
14041391 * repeat the page fault later with a VM_FAULT_NOPAGE retval
14051392 * (potentially after handling any pending signal during the return to
14061393 * userland). The return to userland is identified whenever
14071394 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
1408
- * Thus we have to be careful about not touching vma after handling the
1409
- * fault, so we read the pkey beforehand.
14101395 */
1411
- pkey = vma_pkey(vma);
1412
- fault = handle_mm_fault(vma, address, flags);
1413
- major |= fault & VM_FAULT_MAJOR;
1396
+ fault = handle_mm_fault(vma, address, flags, regs);
1397
+
1398
+ /* Quick path to respond to signals */
1399
+ if (fault_signal_pending(fault, regs)) {
1400
+ if (!user_mode(regs))
1401
+ no_context(regs, hw_error_code, address, SIGBUS,
1402
+ BUS_ADRERR);
1403
+ return;
1404
+ }
14141405
14151406 /*
1416
- * If we need to retry the mmap_sem has already been released,
1407
+ * If we need to retry the mmap_lock has already been released,
14171408 * and if there is a fatal signal pending there is no guarantee
14181409 * that we made any progress. Handle this case first.
14191410 */
1420
- if (unlikely(fault & VM_FAULT_RETRY)) {
1421
- /* Retry at most once */
1422
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
1423
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
1424
- flags |= FAULT_FLAG_TRIED;
1425
- if (!fatal_signal_pending(tsk))
1426
- goto retry;
1427
- }
1411
+ if (unlikely((fault & VM_FAULT_RETRY) &&
1412
+ (flags & FAULT_FLAG_ALLOW_RETRY))) {
1413
+ flags |= FAULT_FLAG_TRIED;
14281414
1429
- /* User mode? Just return to handle the fatal exception */
1430
- if (flags & FAULT_FLAG_USER)
1431
- return;
1415
+ /*
1416
+ * Do not try to reuse this vma and fetch it
1417
+ * again since we will release the mmap_sem.
1418
+ */
1419
+ vma = NULL;
14321420
1433
- /* Not returning to user mode? Handle exceptions or die: */
1434
- no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
1435
- return;
1421
+ goto retry;
14361422 }
14371423
1438
- up_read(&mm->mmap_sem);
1424
+ mmap_read_unlock(mm);
1425
+
1426
+done:
14391427 if (unlikely(fault & VM_FAULT_ERROR)) {
1440
- mm_fault_error(regs, error_code, address, &pkey, fault);
1428
+ mm_fault_error(regs, hw_error_code, address, fault);
14411429 return;
1442
- }
1443
-
1444
- /*
1445
- * Major/minor page fault accounting. If any of the events
1446
- * returned VM_FAULT_MAJOR, we account it as a major fault.
1447
- */
1448
- if (major) {
1449
- tsk->maj_flt++;
1450
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
1451
- } else {
1452
- tsk->min_flt++;
1453
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
14541430 }
14551431
14561432 check_v8086_mode(regs, address, tsk);
14571433 }
1458
-NOKPROBE_SYMBOL(__do_page_fault);
1434
+NOKPROBE_SYMBOL(do_user_addr_fault);
14591435
1460
-static nokprobe_inline void
1461
-trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1462
- unsigned long error_code)
1436
+static __always_inline void
1437
+trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
1438
+ unsigned long address)
14631439 {
1440
+ if (!trace_pagefault_enabled())
1441
+ return;
1442
+
14641443 if (user_mode(regs))
14651444 trace_page_fault_user(address, regs, error_code);
14661445 else
14671446 trace_page_fault_kernel(address, regs, error_code);
14681447 }
14691448
1470
-/*
1471
- * We must have this function blacklisted from kprobes, tagged with notrace
1472
- * and call read_cr2() before calling anything else. To avoid calling any
1473
- * kind of tracing machinery before we've observed the CR2 value.
1474
- *
1475
- * exception_{enter,exit}() contains all sorts of tracepoints.
1476
- */
1477
-dotraplinkage void notrace
1478
-do_page_fault(struct pt_regs *regs, unsigned long error_code)
1449
+static __always_inline void
1450
+handle_page_fault(struct pt_regs *regs, unsigned long error_code,
1451
+ unsigned long address)
14791452 {
1480
- unsigned long address = read_cr2(); /* Get the faulting address */
1481
- enum ctx_state prev_state;
1453
+ trace_page_fault_entries(regs, error_code, address);
14821454
1483
- prev_state = exception_enter();
1484
- if (trace_pagefault_enabled())
1485
- trace_page_fault_entries(address, regs, error_code);
1455
+ if (unlikely(kmmio_fault(regs, address)))
1456
+ return;
14861457
1487
- __do_page_fault(regs, error_code, address);
1488
- exception_exit(prev_state);
1458
+ /* Was the fault on kernel-controlled part of the address space? */
1459
+ if (unlikely(fault_in_kernel_space(address))) {
1460
+ do_kern_addr_fault(regs, error_code, address);
1461
+ } else {
1462
+ do_user_addr_fault(regs, error_code, address);
1463
+ /*
1464
+ * User address page fault handling might have reenabled
1465
+ * interrupts. Fixing up all potential exit points of
1466
+ * do_user_addr_fault() and its leaf functions is just not
1467
+ * doable w/o creating an unholy mess or turning the code
1468
+ * upside down.
1469
+ */
1470
+ local_irq_disable();
1471
+ }
14891472 }
1490
-NOKPROBE_SYMBOL(do_page_fault);
1473
+
1474
+DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
1475
+{
1476
+ unsigned long address = read_cr2();
1477
+ irqentry_state_t state;
1478
+
1479
+ prefetchw(&current->mm->mmap_lock);
1480
+
1481
+ /*
1482
+ * KVM uses #PF vector to deliver 'page not present' events to guests
1483
+ * (asynchronous page fault mechanism). The event happens when a
1484
+ * userspace task is trying to access some valid (from guest's point of
1485
+ * view) memory which is not currently mapped by the host (e.g. the
1486
+ * memory is swapped out). Note, the corresponding "page ready" event
1487
+ * which is injected when the memory becomes available, is delivered via
1488
+ * an interrupt mechanism and not a #PF exception
1489
+ * (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
1490
+ *
1491
+ * We are relying on the interrupted context being sane (valid RSP,
1492
+ * relevant locks not held, etc.), which is fine as long as the
1493
+ * interrupted context had IF=1. We are also relying on the KVM
1494
+ * async pf type field and CR2 being read consistently instead of
1495
+ * getting values from real and async page faults mixed up.
1496
+ *
1497
+ * Fingers crossed.
1498
+ *
1499
+ * The async #PF handling code takes care of idtentry handling
1500
+ * itself.
1501
+ */
1502
+ if (kvm_handle_async_pf(regs, (u32)address))
1503
+ return;
1504
+
1505
+ /*
1506
+ * Entry handling for valid #PF from kernel mode is slightly
1507
+ * different: RCU is already watching and rcu_irq_enter() must not
1508
+ * be invoked because a kernel fault on a user space address might
1509
+ * sleep.
1510
+ *
1511
+ * In case the fault hit an RCU idle region, the conditional entry
1512
+ * code re-enables RCU to avoid subsequent wreckage, which helps
1513
+ * debuggability.
1514
+ */
1515
+ state = irqentry_enter(regs);
1516
+
1517
+ instrumentation_begin();
1518
+ handle_page_fault(regs, error_code, address);
1519
+ instrumentation_end();
1520
+
1521
+ irqentry_exit(regs, state);
1522
+}
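For reference, the DEFINE_IDTENTRY_RAW_ERRORCODE() entry above replaces the old dotraplinkage do_page_fault() prototype. A rough sketch of the surrounding glue, under the assumption that the usual idtentry machinery is used (simplified; not the verbatim idtentry.h/idt.c contents):

/* The macro roughly expands the definition in this file to: */
__visible noinstr void exc_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	/* body as in the hunk above: read CR2 first, then kvm_handle_async_pf(),
	 * irqentry_enter(), instrumentation_begin()/end(), irqentry_exit() */
}

/* ...while the IDT setup points vector 14 (X86_TRAP_PF) at the matching asm stub: */
INTG(X86_TRAP_PF, asm_exc_page_fault),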