2024-11-01 2f529f9b558ca1c1bd74be7437a84e4711743404
kernel/arch/x86/mm/fault.c
@@ -19,6 +19,7 @@
 #include <linux/uaccess.h>		/* faulthandler_disabled()	*/
 #include <linux/efi.h>			/* efi_recover_from_page_fault()*/
 #include <linux/mm_types.h>
+#include <linux/irqstage.h>
 
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
@@ -656,7 +657,7 @@
 		 * the below recursive fault logic only apply to a faults from
 		 * task context.
 		 */
-		if (in_interrupt())
+		if (running_oob() || in_interrupt())
 			return;
 
 		/*
@@ -666,10 +667,12 @@
 		 * faulting through the emulate_vsyscall() logic.
 		 */
 		if (current->thread.sig_on_uaccess_err && signal) {
+			oob_trap_notify(X86_TRAP_PF, regs);
 			set_signal_archinfo(address, error_code);
 
 			/* XXX: hwpoison faults will set the wrong code. */
 			force_sig_fault(signal, si_code, (void __user *)address);
+			oob_trap_unwind(X86_TRAP_PF, regs);
 		}
 
 		/*
@@ -677,6 +680,12 @@
 		 */
 		return;
 	}
+
+	/*
+	 * Do not bother unwinding the notification context on
+	 * CPU/firmware/kernel bug.
+	 */
+	oob_trap_notify(X86_TRAP_PF, regs);
 
 #ifdef CONFIG_VMAP_STACK
 	/*
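The two hunks above fence in-band signal delivery with an oob_trap_notify()/oob_trap_unwind() pair, while the path just added notifies without unwinding: once the handler is committed to an oops, there is no out-of-band context left worth restoring. A minimal sketch of the pattern, with a hypothetical helper name that is not part of the patch:

	/*
	 * Sketch: in-band recovery work done on behalf of a possibly
	 * out-of-band task is bracketed by notify/unwind, so the
	 * companion core may demote current to in-band mode first and
	 * resume the previous stage on the way out. Paths that cannot
	 * return (oops/panic) notify but never unwind.
	 */
	static void fault_recover_inband(struct pt_regs *regs)
	{
		oob_trap_notify(X86_TRAP_PF, regs);	/* may demote current to in-band */
		/* ... in-band fixups, e.g. force_sig_fault() ... */
		oob_trap_unwind(X86_TRAP_PF, regs);	/* restore the previous stage */
	}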
@@ -796,6 +805,55 @@
 	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
 }
 
+#ifdef CONFIG_IRQ_PIPELINE
+
+static inline void cond_reenable_irqs_user(void)
+{
+	hard_local_irq_enable();
+
+	if (running_inband())
+		local_irq_enable();
+}
+
+static inline void cond_reenable_irqs_kernel(irqentry_state_t state,
+					     struct pt_regs *regs)
+{
+	if (regs->flags & X86_EFLAGS_IF) {
+		hard_local_irq_enable();
+		if (state.stage_info == IRQENTRY_INBAND_UNSTALLED)
+			local_irq_enable();
+	}
+}
+
+static inline void cond_disable_irqs(void)
+{
+	hard_local_irq_disable();
+
+	if (running_inband())
+		local_irq_disable();
+}
+
+#else	/* !CONFIG_IRQ_PIPELINE */
+
+static inline void cond_reenable_irqs_user(void)
+{
+	local_irq_enable();
+}
+
+static inline void cond_reenable_irqs_kernel(irqentry_state_t state,
+					     struct pt_regs *regs)
+{
+	if (regs->flags & X86_EFLAGS_IF)
+		local_irq_enable();
+}
+
+static inline void cond_disable_irqs(void)
+{
+	local_irq_disable();
+}
+
+#endif	/* !CONFIG_IRQ_PIPELINE */
+
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 pkey, int si_code)
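These helpers encode the pipeline's two-stage interrupt model: hard_local_irq_*() drive the real interrupt flag in the CPU (EFLAGS.IF on x86), while the plain local_irq_*() calls only virtualize masking for the in-band stage; the kernel-entry variant also checks state.stage_info so a fault taken with the in-band stage stalled does not unstall it behind the caller's back. A sketch of how the two levels compose, assuming the usual Dovetail primitives (illustrative, not from the patch):

	/*
	 * Sketch (assumption): under CONFIG_IRQ_PIPELINE, stalling the
	 * in-band stage leaves the hardware interrupt flag untouched,
	 * so out-of-band IRQs keep preempting; only the hard_* forms
	 * make a section truly atomic.
	 */
	static void two_stage_masking_example(void)
	{
		unsigned long vflags, hflags;

		local_irq_save(vflags);		/* stall the in-band stage only */
		/* out-of-band interrupts may still preempt here */

		hflags = hard_local_irq_save();	/* now mask the CPU for real */
		/* ... truly atomic section ... */
		hard_local_irq_restore(hflags);

		local_irq_restore(vflags);	/* unstall the in-band stage */
	}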
@@ -807,7 +865,7 @@
 		/*
 		 * It's possible to have interrupts off here:
 		 */
-		local_irq_enable();
+		cond_reenable_irqs_user();
 
 		/*
 		 * Valid to do another page fault here because this one came
@@ -818,6 +876,12 @@
 
 		if (is_errata100(regs, address))
 			return;
+
+		oob_trap_notify(X86_TRAP_PF, regs);
+		if (!running_inband()) {
+			local_irq_disable_full();
+			return;
+		}
 
 		/*
 		 * To avoid leaking information about the kernel page table
@@ -837,7 +901,9 @@
 
 		force_sig_fault(SIGSEGV, si_code, (void __user *)address);
 
-		local_irq_disable();
+		local_irq_disable_full();
+
+		oob_trap_unwind(X86_TRAP_PF, regs);
 
 		return;
 	}
@@ -1225,7 +1291,8 @@
 static inline
 void do_user_addr_fault(struct pt_regs *regs,
 			unsigned long hw_error_code,
-			unsigned long address)
+			unsigned long address,
+			irqentry_state_t state)
 {
 	struct vm_area_struct *vma = NULL;
 	struct task_struct *tsk;
@@ -1266,7 +1333,7 @@
 	 * If we're in an interrupt, have no user context or are running
 	 * in a region with pagefaults disabled then we must not take the fault
 	 */
-	if (unlikely(faulthandler_disabled() || !mm)) {
+	if (unlikely(running_inband() && (faulthandler_disabled() || !mm))) {
 		bad_area_nosemaphore(regs, hw_error_code, address);
 		return;
 	}
@@ -1279,12 +1346,22 @@
 	 * potential system fault or CPU buglet:
 	 */
 	if (user_mode(regs)) {
-		local_irq_enable();
+		cond_reenable_irqs_user();
 		flags |= FAULT_FLAG_USER;
 	} else {
-		if (regs->flags & X86_EFLAGS_IF)
-			local_irq_enable();
+		cond_reenable_irqs_kernel(state, regs);
 	}
+
+	/*
+	 * At this point, we would have to stop running
+	 * out-of-band. Tell the companion core about the page fault
+	 * event, so that it might switch current to in-band mode if
+	 * need be. If it does not, then we may assume that it would
+	 * also handle the fixups.
+	 */
+	oob_trap_notify(X86_TRAP_PF, regs);
+	if (!running_inband())
+		return;
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
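This hunk is the pivot of the patch: before touching the mm, the handler tells the companion core about the fault, and if the core does not demote current to the in-band stage, it is assumed to own the fault, fixups included, so the in-band handler backs off. A hypothetical sketch of the core-side half of that handshake; both the hook name handle_oob_trap_entry() and the demotion call are assumptions, not shown in this patch:

	/*
	 * Hypothetical companion-core hook answering oob_trap_notify():
	 * demote the faulting task to the in-band stage so that the
	 * regular do_user_addr_fault() path may service the fault,
	 * unless the core resolves it entirely by itself.
	 */
	void handle_oob_trap_entry(unsigned int trapnr, struct pt_regs *regs)
	{
		if (trapnr == X86_TRAP_PF && running_oob())
			dovetail_leave_oob();	/* switch current to in-band */
	}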
@@ -1307,7 +1384,7 @@
 	 */
 	if (is_vsyscall_vaddr(address)) {
 		if (emulate_vsyscall(hw_error_code, regs, address))
-			return;
+			goto out;
 	}
 #endif
 
@@ -1340,7 +1417,7 @@
 			 * which we do not expect faults.
 			 */
 			bad_area_nosemaphore(regs, hw_error_code, address);
-			return;
+			goto out;
 		}
 retry:
 		mmap_read_lock(mm);
@@ -1357,17 +1434,17 @@
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
 		bad_area(regs, hw_error_code, address);
-		return;
+		goto out;
 	}
 	if (likely(vma->vm_start <= address))
 		goto good_area;
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
 		bad_area(regs, hw_error_code, address);
-		return;
+		goto out;
 	}
 	if (unlikely(expand_stack(vma, address))) {
 		bad_area(regs, hw_error_code, address);
-		return;
+		goto out;
 	}
 
 	/*
@@ -1377,7 +1454,7 @@
 good_area:
 	if (unlikely(access_error(hw_error_code, vma))) {
 		bad_area_access_error(regs, hw_error_code, address, vma);
-		return;
+		goto out;
 	}
 
 	/*
@@ -1400,7 +1477,7 @@
 		if (!user_mode(regs))
 			no_context(regs, hw_error_code, address, SIGBUS,
 				   BUS_ADRERR);
-		return;
+		goto out;
 	}
 
 	/*
@@ -1426,10 +1503,12 @@
 done:
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mm_fault_error(regs, hw_error_code, address, fault);
-		return;
+		goto out;
 	}
 
 	check_v8086_mode(regs, address, tsk);
+out:
+	oob_trap_unwind(X86_TRAP_PF, regs);
 }
 NOKPROBE_SYMBOL(do_user_addr_fault);
 
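All of the early returns above become goto out so that every exit taken after the notification point runs oob_trap_unwind() exactly once; the lone bare return (on !running_inband()) is the path on which the in-band handler was never entered. The resulting control flow, reduced to a sketch with a made-up placeholder for the real work:

	/* Hypothetical placeholder for the actual fault resolution work. */
	static bool resolve_fault(struct pt_regs *regs);

	/*
	 * Sketch of the exit discipline this hunk introduces: the
	 * notify/unwind bracket must stay balanced on every in-band exit.
	 */
	static void fault_exit_discipline(struct pt_regs *regs)
	{
		oob_trap_notify(X86_TRAP_PF, regs);
		if (!running_inband())
			return;		/* stayed out-of-band: the core owns the fault */

		if (!resolve_fault(regs))
			goto out;	/* error paths funnel through out: */

		/* ... retry and success paths also end up here ... */
	out:
		oob_trap_unwind(X86_TRAP_PF, regs);
	}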
@@ -1448,7 +1527,8 @@
 
 static __always_inline void
 handle_page_fault(struct pt_regs *regs, unsigned long error_code,
-		  unsigned long address)
+		  unsigned long address,
+		  irqentry_state_t state)
 {
 	trace_page_fault_entries(regs, error_code, address);
 
@@ -1459,7 +1539,7 @@
 	if (unlikely(fault_in_kernel_space(address))) {
 		do_kern_addr_fault(regs, error_code, address);
 	} else {
-		do_user_addr_fault(regs, error_code, address);
+		do_user_addr_fault(regs, error_code, address, state);
 		/*
 		 * User address page fault handling might have reenabled
 		 * interrupts. Fixing up all potential exit points of
@@ -1467,7 +1547,7 @@
 		 * doable w/o creating an unholy mess or turning the code
 		 * upside down.
 		 */
-		local_irq_disable();
+		cond_disable_irqs();
 	}
 }
 
@@ -1515,8 +1595,46 @@
 	state = irqentry_enter(regs);
 
 	instrumentation_begin();
-	handle_page_fault(regs, error_code, address);
+	handle_page_fault(regs, error_code, address, state);
 	instrumentation_end();
 
 	irqentry_exit(regs, state);
 }
+
+#ifdef CONFIG_DOVETAIL
+
+void arch_advertise_page_mapping(unsigned long start, unsigned long end)
+{
+	unsigned long next, addr = start;
+	pgd_t *pgd, *pgd_ref;
+	struct page *page;
+
+	/*
+	 * APEI may create temporary mappings in interrupt context -
+	 * nothing we can or need to propagate globally.
+	 */
+	if (in_interrupt())
+		return;
+
+	if (!(start >= VMALLOC_START && start < VMALLOC_END))
+		return;
+
+	do {
+		next = pgd_addr_end(addr, end);
+		pgd_ref = pgd_offset_k(addr);
+		if (!pgd_none(*pgd_ref)) {
+			spin_lock(&pgd_lock);
+			list_for_each_entry(page, &pgd_list, lru) {
+				pgd = page_address(page) + pgd_index(addr);
+				if (pgd_none(*pgd))
+					set_pgd(pgd, *pgd_ref);
+			}
+			spin_unlock(&pgd_lock);
+		}
+		addr = next;
+	} while (addr != end);
+
+	arch_flush_lazy_mmu_mode();
+}
+
+#endif
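arch_advertise_page_mapping() eagerly copies the kernel PGD entries covering a fresh vmalloc range into every page table on pgd_list, doing up front what the kernel's lazy vmalloc PGD synchronization would otherwise do at fault time. The point is that an out-of-band context must never depend on that lazy path, since the in-band fault handler cannot run there. A hypothetical caller on the companion-core side (the wrapper below is an assumption, not part of the patch):

	/*
	 * Hypothetical usage: a companion core allocating memory that
	 * out-of-band handlers will dereference advertises the mapping
	 * up front, so no oob context ever faults on a missing PGD entry.
	 */
	static void *oob_safe_vmalloc(unsigned long size)
	{
		void *p = vmalloc(size);

		if (p)
			arch_advertise_page_mapping((unsigned long)p,
						    (unsigned long)p + size);
		return p;
	}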