hc
2023-12-09 b22da3d8526a935aa31e086e63f60ff3246cb61c
kernel/arch/arm64/mm/fault.c
@@ -1,28 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Based on arch/arm/mm/fault.c
  *
  * Copyright (C) 1995 Linus Torvalds
  * Copyright (C) 1995-2004 Russell King
  * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/acpi.h>
+#include <linux/bitfield.h>
 #include <linux/extable.h>
+#include <linux/kfence.h>
 #include <linux/signal.h>
 #include <linux/mm.h>
 #include <linux/hardirq.h>
 #include <linux/init.h>
+#include <linux/kasan.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
@@ -33,23 +26,26 @@
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>
 
+#include <asm/acpi.h>
 #include <asm/bug.h>
 #include <asm/cmpxchg.h>
 #include <asm/cpufeature.h>
 #include <asm/exception.h>
+#include <asm/daifflags.h>
 #include <asm/debug-monitors.h>
 #include <asm/esr.h>
-#include <asm/kasan.h>
+#include <asm/kprobes.h>
+#include <asm/mte.h>
+#include <asm/processor.h>
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/traps.h>
 
-#include <acpi/ghes.h>
+#include <trace/hooks/fault.h>
 
 struct fault_info {
-	int (*fn)(unsigned long addr, unsigned int esr,
+	int (*fn)(unsigned long far, unsigned int esr,
 		  struct pt_regs *regs);
 	int sig;
 	int code;
@@ -57,33 +53,17 @@
 };
 
 static const struct fault_info fault_info[];
+static struct fault_info debug_fault_info[];
 
 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
-	return fault_info + (esr & 63);
+	return fault_info + (esr & ESR_ELx_FSC);
 }
 
-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
 {
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, esr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
+	return debug_fault_info + DBG_ESR_EVT(esr);
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	return 0;
-}
-#endif
 
 static void data_abort_decode(unsigned int esr)
 {
@@ -112,8 +92,8 @@
 	pr_alert("Mem abort info:\n");
 
 	pr_alert(" ESR = 0x%08x\n", esr);
-	pr_alert(" Exception class = %s, IL = %u bits\n",
-		 esr_get_class_string(esr),
+	pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n",
+		 ESR_ELx_EC(esr), esr_get_class_string(esr),
 		 (esr & ESR_ELx_IL) ? 32 : 16);
 	pr_alert(" SET = %lu, FnV = %lu\n",
 		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
@@ -126,22 +106,19 @@
 	data_abort_decode(esr);
 }
 
-static inline bool is_ttbr0_addr(unsigned long addr)
+static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
 {
-	/* entry assembly clears tags for TTBR0 addrs */
-	return addr < TASK_SIZE;
-}
+	/* Either init_pg_dir or swapper_pg_dir */
+	if (mm == &init_mm)
+		return __pa_symbol(mm->pgd);
 
-static inline bool is_ttbr1_addr(unsigned long addr)
-{
-	/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
-	return arch_kasan_reset_tag(addr) >= VA_START;
+	return (unsigned long)virt_to_phys(mm->pgd);
 }
 
 /*
  * Dump out the page tables associated with 'addr' in the currently active mm.
  */
-void show_pte(unsigned long addr)
+static void show_pte(unsigned long addr)
 {
 	struct mm_struct *mm;
 	pgd_t *pgdp;
@@ -164,14 +141,15 @@
 		return;
 	}
 
-	pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgdp = %p\n",
+	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
 		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
-		 VA_BITS, mm->pgd);
+		 vabits_actual, mm_to_pgd_phys(mm));
 	pgdp = pgd_offset(mm, addr);
 	pgd = READ_ONCE(*pgdp);
 	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));
 
 	do {
+		p4d_t *p4dp, p4d;
 		pud_t *pudp, pud;
 		pmd_t *pmdp, pmd;
 		pte_t *ptep, pte;
@@ -179,7 +157,13 @@
 		if (pgd_none(pgd) || pgd_bad(pgd))
 			break;
 
-		pudp = pud_offset(pgdp, addr);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		pr_cont(", p4d=%016llx", p4d_val(p4d));
+		if (p4d_none(p4d) || p4d_bad(p4d))
+			break;
+
+		pudp = pud_offset(p4dp, addr);
 		pud = READ_ONCE(*pudp);
 		pr_cont(", pud=%016llx", pud_val(pud));
 		if (pud_none(pud) || pud_bad(pud))
@@ -239,7 +223,9 @@
 		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
 	} while (pteval != old_pteval);
 
-	flush_tlb_fix_spurious_fault(vma, address);
+	/* Invalidate a stale read-only entry */
+	if (dirty)
+		flush_tlb_page(vma, address);
 	return 1;
 }
 
@@ -248,9 +234,8 @@
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }
 
-static inline bool is_el1_permission_fault(unsigned int esr,
-					   struct pt_regs *regs,
-					   unsigned long addr)
+static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
+					   struct pt_regs *regs)
 {
 	unsigned int ec = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
@@ -268,6 +253,38 @@
 	return false;
 }
 
+static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
+							unsigned int esr,
+							struct pt_regs *regs)
+{
+	unsigned long flags;
+	u64 par, dfsc;
+
+	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR ||
+	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
+		return false;
+
+	local_irq_save(flags);
+	asm volatile("at s1e1r, %0" :: "r" (addr));
+	isb();
+	par = read_sysreg_par();
+	local_irq_restore(flags);
+
+	/*
+	 * If we now have a valid translation, treat the translation fault as
+	 * spurious.
+	 */
+	if (!(par & SYS_PAR_EL1_F))
+		return true;
+
+	/*
+	 * If we got a different type of fault from the AT instruction,
+	 * treat the translation fault as spurious.
+	 */
+	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
+	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
+}
+
 static void die_kernel_fault(const char *msg, unsigned long addr,
 			     unsigned int esr, struct pt_regs *regs)
 {
@@ -276,12 +293,72 @@
 	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
 		 addr);
 
+	trace_android_rvh_die_kernel_fault(regs, esr, addr, msg);
 	mem_abort_decode(esr);
 
 	show_pte(addr);
 	die("Oops", regs, esr);
 	bust_spinlocks(0);
 	do_exit(SIGKILL);
+}
+
+#ifdef CONFIG_KASAN_HW_TAGS
+static void report_tag_fault(unsigned long addr, unsigned int esr,
+			     struct pt_regs *regs)
+{
+	static bool reported;
+	bool is_write;
+
+	if (READ_ONCE(reported))
+		return;
+
+	/*
+	 * This is used for KASAN tests and assumes that no MTE faults
+	 * happened before running the tests.
+	 */
+	if (mte_report_once())
+		WRITE_ONCE(reported, true);
+
+	/*
+	 * SAS bits aren't set for all faults reported in EL1, so we can't
+	 * find out access size.
+	 */
+	is_write = !!(esr & ESR_ELx_WNR);
+	kasan_report(addr, 0, is_write, regs->pc);
+}
+#else
+/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
+static inline void report_tag_fault(unsigned long addr, unsigned int esr,
+				    struct pt_regs *regs) { }
+#endif
+
+static void do_tag_recovery(unsigned long addr, unsigned int esr,
+			    struct pt_regs *regs)
+{
+
+	report_tag_fault(addr, esr, regs);
+
+	/*
+	 * Disable MTE Tag Checking on the local CPU for the current EL.
+	 * It will be done lazily on the other CPUs when they will hit a
+	 * tag fault.
+	 */
+	sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
+	isb();
+}
+
+static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
+{
+	unsigned int ec = ESR_ELx_EC(esr);
+	unsigned int fsc = esr & ESR_ELx_FSC;
+
+	if (ec != ESR_ELx_EC_DABT_CUR)
+		return false;
+
+	if (fsc == ESR_ELx_FSC_MTE)
+		return true;
+
+	return false;
 }
 
 static void __do_kernel_fault(unsigned long addr, unsigned int esr,
@@ -296,23 +373,38 @@
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;
 
-	if (is_el1_permission_fault(esr, regs, addr)) {
+	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
+	    "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+		return;
+
+	if (is_el1_mte_sync_tag_check_fault(esr)) {
+		do_tag_recovery(addr, esr, regs);
+
+		return;
+	}
+
+	if (is_el1_permission_fault(addr, esr, regs)) {
 		if (esr & ESR_ELx_WNR)
 			msg = "write to read-only memory";
+		else if (is_el1_instruction_abort(esr))
+			msg = "execute from non-executable memory";
 		else
 			msg = "read from unreadable memory";
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
+		if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+			return;
+
 		msg = "paging request";
 	}
 
 	die_kernel_fault(msg, addr, esr, regs);
 }
 
-static void __do_user_fault(struct siginfo *info, unsigned int esr)
+static void set_thread_esr(unsigned long address, unsigned int esr)
 {
-	current->thread.fault_address = (unsigned long)info->si_addr;
+	current->thread.fault_address = address;
 
 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -365,25 +457,22 @@
 	}
 
 	current->thread.fault_code = esr;
-	arm64_force_sig_info(info, esr_to_fault_info(esr)->name, current);
 }
 
-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-		struct siginfo si;
 
-		clear_siginfo(&si);
-		si.si_signo = inf->sig;
-		si.si_code = inf->code;
-		si.si_addr = (void __user *)addr;
-
-		__do_user_fault(&si, esr);
+		set_thread_esr(addr, esr);
+		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
@@ -392,41 +481,32 @@
 #define VM_FAULT_BADMAP 0x010000
 #define VM_FAULT_BADACCESS 0x020000
 
-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
-				  unsigned int mm_flags, unsigned long vm_flags,
-				  struct task_struct *tsk)
+static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
+			   unsigned int mm_flags, unsigned long vm_flags,
+			   struct pt_regs *regs)
 {
-	struct vm_area_struct *vma;
-	vm_fault_t fault;
 
-	vma = find_vma(mm, addr);
-	fault = VM_FAULT_BADMAP;
 	if (unlikely(!vma))
-		goto out;
-	if (unlikely(vma->vm_start > addr))
-		goto check_stack;
+		return VM_FAULT_BADMAP;
 
 	/*
 	 * Ok, we have a good vm_area for this memory access, so we can handle
 	 * it.
 	 */
-good_area:
+	if (unlikely(vma->vm_start > addr)) {
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			return VM_FAULT_BADMAP;
+		if (expand_stack(vma, addr))
+			return VM_FAULT_BADMAP;
+	}
+
 	/*
 	 * Check that the permissions on the VMA allow for the fault which
 	 * occurred.
 	 */
-	if (!(vma->vm_flags & vm_flags)) {
-		fault = VM_FAULT_BADACCESS;
-		goto out;
-	}
-
-	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
-
-check_stack:
-	if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
-		goto good_area;
-out:
-	return fault;
+	if (!(vma->vm_flags & vm_flags))
+		return VM_FAULT_BADACCESS;
+	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs);
 }
 
 static bool is_el0_instruction_abort(unsigned int esr)
@@ -434,21 +514,28 @@
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
 }
 
-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+/*
+ * Note: not valid for EL1 DC IVAC, but we never use that such that it
+ * should fault. EL0 cannot issue DC IVAC (undef).
+ */
+static bool is_write_abort(unsigned int esr)
+{
+	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
+}
+
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct siginfo si;
-	vm_fault_t fault, major = 0;
-	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
-	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	const struct fault_info *inf;
+	struct mm_struct *mm = current->mm;
+	vm_fault_t fault;
+	unsigned long vm_flags = VM_ACCESS_FLAGS;
+	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	struct vm_area_struct *vma = NULL;
+	unsigned long addr = untagged_addr(far);
 
-	if (notify_page_fault(regs, esr))
+	if (kprobe_page_fault(regs, esr))
 		return 0;
-
-	tsk = current;
-	mm = tsk->mm;
 
 	/*
 	 * If we're in an interrupt or have no user context, we must not take
@@ -462,12 +549,13 @@
 
 	if (is_el0_instruction_abort(esr)) {
 		vm_flags = VM_EXEC;
-	} else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {
+		mm_flags |= FAULT_FLAG_INSTRUCTION;
+	} else if (is_write_abort(esr)) {
 		vm_flags = VM_WRITE;
 		mm_flags |= FAULT_FLAG_WRITE;
 	}
 
-	if (is_ttbr0_addr(addr) && is_el1_permission_fault(esr, regs, addr)) {
+	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
 		/* regs->orig_addr_limit may be 0 if we entered from EL0 */
 		if (regs->orig_addr_limit == KERNEL_DS)
 			die_kernel_fault("access to user memory with fs=KERNEL_DS",
@@ -485,15 +573,23 @@
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
 
 	/*
+	 * let's try a speculative page fault without grabbing the
+	 * mmap_sem.
+	 */
+	fault = handle_speculative_fault(mm, addr, mm_flags, &vma, regs);
+	if (fault != VM_FAULT_RETRY)
+		goto done;
+
+	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
 	 * we can bug out early if this is from code which shouldn't.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (!mmap_read_trylock(mm)) {
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
 retry:
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 	} else {
 		/*
 		 * The above down_read_trylock() might have succeeded in which
@@ -501,62 +597,47 @@
 		 */
 		might_sleep();
 #ifdef CONFIG_DEBUG_VM
-		if (!user_mode(regs) && !search_exception_tables(regs->pc))
+		if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+			mmap_read_unlock(mm);
 			goto no_context;
+		}
 #endif
 	}
 
-	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
-	major |= fault & VM_FAULT_MAJOR;
+	if (!vma || !can_reuse_spf_vma(vma, addr))
+		vma = find_vma(mm, addr);
+	fault = __do_page_fault(vma, addr, mm_flags, vm_flags, regs);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			goto no_context;
+		return 0;
+	}
 
 	if (fault & VM_FAULT_RETRY) {
-		/*
-		 * If we need to retry but a fatal signal is pending,
-		 * handle the signal first. We do not need to release
-		 * the mmap_sem because it would already be released
-		 * in __lock_page_or_retry in mm/filemap.c.
-		 */
-		if (fatal_signal_pending(current)) {
-			if (!user_mode(regs))
-				goto no_context;
-			return 0;
-		}
-
-		/*
-		 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
-		 * starvation.
-		 */
 		if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
-			mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 			mm_flags |= FAULT_FLAG_TRIED;
+
+			/*
+			 * Do not try to reuse this vma and fetch it
+			 * again since we will release the mmap_sem.
+			 */
+			vma = NULL;
+
 			goto retry;
 		}
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+done:
 
 	/*
 	 * Handle the "normal" (no error) case first.
 	 */
 	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
-			      VM_FAULT_BADACCESS)))) {
-		/*
-		 * Major/minor page fault accounting is only done
-		 * once. If we go through a retry, it is extremely
-		 * likely that the page will be found in page cache at
-		 * that point.
-		 */
-		if (major) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
-				      addr);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
-				      addr);
-		}
-
+			      VM_FAULT_BADACCESS))))
 		return 0;
-	}
 
 	/*
 	 * If we are in kernel mode at this point, we have no context to
@@ -575,37 +656,32 @@
 		return 0;
 	}
 
-	clear_siginfo(&si);
-	si.si_addr = (void __user *)addr;
-
+	inf = esr_to_fault_info(esr);
+	set_thread_esr(addr, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		si.si_signo = SIGBUS;
-		si.si_code = BUS_ADRERR;
-	} else if (fault & VM_FAULT_HWPOISON_LARGE) {
-		unsigned int hindex = VM_FAULT_GET_HINDEX(fault);
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
+	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
+		unsigned int lsb;
 
-		si.si_signo = SIGBUS;
-		si.si_code = BUS_MCEERR_AR;
-		si.si_addr_lsb = hstate_index_to_shift(hindex);
-	} else if (fault & VM_FAULT_HWPOISON) {
-		si.si_signo = SIGBUS;
-		si.si_code = BUS_MCEERR_AR;
-		si.si_addr_lsb = PAGE_SHIFT;
+		lsb = PAGE_SHIFT;
+		if (fault & VM_FAULT_HWPOISON_LARGE)
+			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+
+		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
 		 * map.
 		 */
-		si.si_signo = SIGSEGV;
-		si.si_code = fault == VM_FAULT_BADACCESS ?
-			SEGV_ACCERR : SEGV_MAPERR;
+		arm64_force_sig_fault(SIGSEGV,
+				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
+				      far, inf->name);
 	}
 
-	__do_user_fault(&si, esr);
 	return 0;
 
 no_context:
@@ -613,81 +689,84 @@
 	return 0;
 }
 
-int __weak do_tlb_conf_fault(unsigned long addr,
-			     unsigned int esr,
-			     struct pt_regs *regs)
-{
-	return 1; /* do_bad default */
-}
-
-int (*do_tlb_conf_fault_cb)(unsigned long addr,
-			    unsigned int esr,
-			    struct pt_regs *regs)
-			    = do_tlb_conf_fault; /* initialization saves us a branch */
-EXPORT_SYMBOL_GPL(do_tlb_conf_fault_cb);
-
-static int _do_tlb_conf_fault(unsigned long addr,
-			      unsigned int esr,
-			      struct pt_regs *regs)
-{
-	return (*do_tlb_conf_fault_cb)(addr, esr, regs);
-}
-
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
-	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+	unsigned long addr = untagged_addr(far);
 
-	do_bad_area(addr, esr, regs);
+	if (is_ttbr0_addr(addr))
+		return do_page_fault(far, esr, regs);
+
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+#ifdef CONFIG_ROCKCHIP_ARM64_ALIGN_FAULT_FIX
+extern int alignment_fixup_helper(unsigned long addr, unsigned int esr,
+				  struct pt_regs *regs);
+#endif
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+#ifdef CONFIG_ROCKCHIP_ARM64_ALIGN_FAULT_FIX
+	if (!alignment_fixup_helper(far, esr, regs))
+		return 0;
+#endif
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
-	return 1; /* "fault" */
+	unsigned long addr = untagged_addr(far);
+	int ret = 1;
+
+	trace_android_vh_handle_tlb_conf(addr, esr, &ret);
+	return ret;
 }
 
-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
-	struct siginfo info;
 	const struct fault_info *inf;
+	unsigned long siaddr;
 
 	inf = esr_to_fault_info(esr);
 
-	/*
-	 * Synchronous aborts may interrupt code which had interrupts masked.
-	 * Before calling out into the wider kernel tell the interested
-	 * subsystems.
-	 */
-	if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
-		if (interrupts_enabled(regs))
-			nmi_enter();
-
-		ghes_notify_sea();
-
-		if (interrupts_enabled(regs))
-			nmi_exit();
+	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
+		/*
+		 * APEI claimed this as a firmware-first notification.
+		 * Some processing deferred to task_work before ret_to_user().
+		 */
+		return 0;
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code = inf->code;
-	if (esr & ESR_ELx_FnV)
-		info.si_addr = NULL;
-	else
-		info.si_addr = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+	if (esr & ESR_ELx_FnV) {
+		siaddr = 0;
+	} else {
+		/*
+		 * The architecture specifies that the tag bits of FAR_EL1 are
+		 * UNKNOWN for synchronous external aborts. Mask them out now
+		 * so that userspace doesn't see them.
+		 */
+		siaddr = untagged_addr(far);
+	}
+	trace_android_rvh_do_sea(regs, esr, siaddr, inf->name);
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);
 
+	return 0;
+}
+
+static int do_tag_check_fault(unsigned long far, unsigned int esr,
+			      struct pt_regs *regs)
+{
+	/*
+	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
+	 * for tag check faults. Set them to corresponding bits in the untagged
+	 * address.
+	 */
+	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+	do_bad_area(far, esr, regs);
 	return 0;
 }
 
@@ -709,7 +788,7 @@
 	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
 	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
 	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
-	{ do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
+	{ do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
 	{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
@@ -740,7 +819,7 @@
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 45" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 46" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 47" },
-	{ _do_tlb_conf_fault, SIGKILL, SI_KERNEL, "TLB conflict abort" },
+	{ do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" },
 	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
@@ -758,76 +837,45 @@
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
 };
 
-int handle_guest_sea(phys_addr_t addr, unsigned int esr)
-{
-	return ghes_notify_sea();
-}
-
-asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
-					 struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
-	struct siginfo info;
+	unsigned long addr = untagged_addr(far);
 
-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;
 
 	if (!user_mode(regs)) {
 		pr_alert("Unhandled fault at 0x%016lx\n", addr);
+		trace_android_rvh_do_mem_abort(regs, esr, addr, inf->name);
 		mem_abort_decode(esr);
 		show_pte(addr);
 	}
 
-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code = inf->code;
-	info.si_addr = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+	/*
+	 * At this point we have an unrecognized fault type whose tag bits may
+	 * have been defined as UNKNOWN. Therefore we only expose the untagged
+	 * address to the signal handler.
+	 */
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
 }
+NOKPROBE_SYMBOL(do_mem_abort);
 
-asmlinkage void __exception do_el0_irq_bp_hardening(void)
+void do_el0_irq_bp_hardening(void)
 {
 	/* PC has already been checked in entry.S */
 	arm64_apply_bp_hardening();
 }
+NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);
 
-asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
-						   unsigned int esr,
-						   struct pt_regs *regs)
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	/*
-	 * We've taken an instruction abort from userspace and not yet
-	 * re-enabled IRQs. If the address is a kernel address, apply
-	 * BP hardening prior to enabling IRQs and pre-emption.
-	 */
-	if (!is_ttbr0_addr(addr))
-		arm64_apply_bp_hardening();
+	trace_android_rvh_do_sp_pc_abort(regs, esr, addr, user_mode(regs));
 
-	local_irq_enable();
-	do_mem_abort(addr, esr, regs);
+	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+			 addr, esr);
 }
-
-
-asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
-					   unsigned int esr,
-					   struct pt_regs *regs)
-{
-	struct siginfo info;
-
-	if (user_mode(regs)) {
-		if (!is_ttbr0_addr(instruction_pointer(regs)))
-			arm64_apply_bp_hardening();
-		local_irq_enable();
-	}
-
-	clear_siginfo(&info);
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *)addr;
-	arm64_notify_die("SP/PC alignment exception", regs, &info, esr);
-}
+NOKPROBE_SYMBOL(do_sp_pc_abort);
 
 int __init early_brk64(unsigned long addr, unsigned int esr,
 		       struct pt_regs *regs);
@@ -860,11 +908,32 @@
 	debug_fault_info[nr].name = name;
 }
 
+/*
+ * In debug exception context, we explicitly disable preemption despite
+ * having interrupts disabled.
+ * This serves two purposes: it makes it much less likely that we would
+ * accidentally schedule in exception context and it will force a warning
+ * if we somehow manage to schedule by accident.
+ */
+static void debug_exception_enter(struct pt_regs *regs)
+{
+	preempt_disable();
+
+	/* This code is a bit fragile. Test it. */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
+}
+NOKPROBE_SYMBOL(debug_exception_enter);
+
+static void debug_exception_exit(struct pt_regs *regs)
+{
+	preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(debug_exception_exit);
+
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);
 
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
 {
 	if (user_mode(regs))
 		return 0;
@@ -883,65 +952,57 @@
 	return 1;
 }
 #else
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
 {
 	return 0;
 }
 #endif /* CONFIG_ARM64_ERRATUM_1463225 */
+NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler);
 
-asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint,
-					      unsigned int esr,
-					      struct pt_regs *regs)
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+			struct pt_regs *regs)
 {
-	const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
+	const struct fault_info *inf = esr_to_debug_fault_info(esr);
 	unsigned long pc = instruction_pointer(regs);
-	int rv;
 
 	if (cortex_a76_erratum_1463225_debug_handler(regs))
-		return 0;
+		return;
 
-	/*
-	 * Tell lockdep we disabled irqs in entry.S. Do nothing if they were
-	 * already disabled to preserve the last enabled/disabled addresses.
-	 */
-	if (interrupts_enabled(regs))
-		trace_hardirqs_off();
+	debug_exception_enter(regs);
 
 	if (user_mode(regs) && !is_ttbr0_addr(pc))
 		arm64_apply_bp_hardening();
 
-	if (!inf->fn(addr_if_watchpoint, esr, regs)) {
-		rv = 1;
-	} else {
-		struct siginfo info;
-
-		clear_siginfo(&info);
-		info.si_signo = inf->sig;
-		info.si_errno = 0;
-		info.si_code = inf->code;
-		info.si_addr = (void __user *)pc;
-		arm64_notify_die(inf->name, regs, &info, esr);
-		rv = 0;
+	if (inf->fn(addr_if_watchpoint, esr, regs)) {
+		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
 	}
 
-	if (interrupts_enabled(regs))
-		trace_hardirqs_on();
-
-	return rv;
+	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);
 
-#ifdef CONFIG_ARM64_PAN
-void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
+/*
+ * Used during anonymous page fault handling.
+ */
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr)
 {
-	/*
-	 * We modify PSTATE. This won't work from irq context as the PSTATE
-	 * is discarded once we return from the exception.
-	 */
-	WARN_ON_ONCE(in_interrupt());
+	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO | __GFP_CMA;
 
-	sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0);
-	asm(SET_PSTATE_PAN(1));
+	/*
+	 * If the page is mapped with PROT_MTE, initialise the tags at the
+	 * point of allocation and page zeroing as this is usually faster than
+	 * separate DC ZVA and STGM.
+	 */
+	if (vma->vm_flags & VM_MTE)
+		flags |= __GFP_ZEROTAGS;
+
+	return alloc_page_vma(flags, vma, vaddr);
 }
-#endif /* CONFIG_ARM64_PAN */
+
+void tag_clear_highpage(struct page *page)
+{
+	mte_zero_clear_page_tags(page_address(page));
+	page_kasan_tag_reset(page);
+	set_bit(PG_mte_tagged, &page->flags);
+}