2024-02-20 102a0743326a03cd1a1202ceda21e175b7d3575c
--- kernel/arch/arm64/mm/fault.c
+++ kernel/arch/arm64/mm/fault.c
@@ -1,28 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-only
 /*
  * Based on arch/arm/mm/fault.c
  *
  * Copyright (C) 1995 Linus Torvalds
  * Copyright (C) 1995-2004 Russell King
  * Copyright (C) 2012 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */

+#include <linux/acpi.h>
+#include <linux/bitfield.h>
 #include <linux/extable.h>
+#include <linux/kfence.h>
 #include <linux/signal.h>
 #include <linux/mm.h>
 #include <linux/hardirq.h>
 #include <linux/init.h>
+#include <linux/kasan.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/page-flags.h>
@@ -33,23 +26,26 @@
 #include <linux/preempt.h>
 #include <linux/hugetlb.h>

+#include <asm/acpi.h>
 #include <asm/bug.h>
 #include <asm/cmpxchg.h>
 #include <asm/cpufeature.h>
 #include <asm/exception.h>
+#include <asm/daifflags.h>
 #include <asm/debug-monitors.h>
 #include <asm/esr.h>
-#include <asm/kasan.h>
+#include <asm/kprobes.h>
+#include <asm/mte.h>
+#include <asm/processor.h>
 #include <asm/sysreg.h>
 #include <asm/system_misc.h>
-#include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/traps.h>

-#include <acpi/ghes.h>
+#include <trace/hooks/fault.h>

 struct fault_info {
-	int	(*fn)(unsigned long addr, unsigned int esr,
+	int	(*fn)(unsigned long far, unsigned int esr,
 		      struct pt_regs *regs);
 	int	sig;
 	int	code;
@@ -57,33 +53,17 @@
 };

 static const struct fault_info fault_info[];
+static struct fault_info debug_fault_info[];

 static inline const struct fault_info *esr_to_fault_info(unsigned int esr)
 {
-	return fault_info + (esr & 63);
+	return fault_info + (esr & ESR_ELx_FSC);
 }

-#ifdef CONFIG_KPROBES
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
+static inline const struct fault_info *esr_to_debug_fault_info(unsigned int esr)
 {
-	int ret = 0;
-
-	/* kprobe_running() needs smp_processor_id() */
-	if (!user_mode(regs)) {
-		preempt_disable();
-		if (kprobe_running() && kprobe_fault_handler(regs, esr))
-			ret = 1;
-		preempt_enable();
-	}
-
-	return ret;
+	return debug_fault_info + DBG_ESR_EVT(esr);
 }
-#else
-static inline int notify_page_fault(struct pt_regs *regs, unsigned int esr)
-{
-	return 0;
-}
-#endif

 static void data_abort_decode(unsigned int esr)
 {
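The helper above indexes fault_info[] with the low six bits of the ESR (the fault status code); the change only replaces the bare 63 mask with the named ESR_ELx_FSC constant. A minimal userspace sketch of that decoding, assuming the documented ESR_ELx layout (EC in bits [31:26], IL in bit 25, FSC in bits [5:0]); the macro names and sample value below are invented for illustration, not the kernel's definitions:

#include <stdio.h>

/* Illustrative masks only; they mirror the kernel's ESR_ELx_* layout. */
#define ESR_EC_SHIFT	26
#define ESR_EC_MASK	0x3fu
#define ESR_IL		(1u << 25)
#define ESR_FSC		0x3fu	/* low six bits index the 64-entry fault_info[] */

int main(void)
{
	unsigned int esr = 0x96000045;	/* hypothetical data-abort ESR */

	printf("EC  = 0x%02x\n", (esr >> ESR_EC_SHIFT) & ESR_EC_MASK);
	printf("IL  = %u bits\n", (esr & ESR_IL) ? 32 : 16);
	printf("FSC = 0x%02x -> fault_info[%u]\n", esr & ESR_FSC, esr & ESR_FSC);
	return 0;
}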
@@ -112,8 +92,8 @@
 	pr_alert("Mem abort info:\n");

 	pr_alert("  ESR = 0x%08x\n", esr);
-	pr_alert("  Exception class = %s, IL = %u bits\n",
-		 esr_get_class_string(esr),
+	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
+		 ESR_ELx_EC(esr), esr_get_class_string(esr),
 		 (esr & ESR_ELx_IL) ? 32 : 16);
 	pr_alert("  SET = %lu, FnV = %lu\n",
 		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
@@ -126,22 +106,19 @@
 	data_abort_decode(esr);
 }

-static inline bool is_ttbr0_addr(unsigned long addr)
+static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
 {
-	/* entry assembly clears tags for TTBR0 addrs */
-	return addr < TASK_SIZE;
-}
+	/* Either init_pg_dir or swapper_pg_dir */
+	if (mm == &init_mm)
+		return __pa_symbol(mm->pgd);

-static inline bool is_ttbr1_addr(unsigned long addr)
-{
-	/* TTBR1 addresses may have a tag if KASAN_SW_TAGS is in use */
-	return arch_kasan_reset_tag(addr) >= VA_START;
+	return (unsigned long)virt_to_phys(mm->pgd);
 }

 /*
  * Dump out the page tables associated with 'addr' in the currently active mm.
  */
-void show_pte(unsigned long addr)
+static void show_pte(unsigned long addr)
 {
 	struct mm_struct *mm;
 	pgd_t *pgdp;
@@ -164,14 +141,15 @@
 		return;
 	}

-	pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgdp = %p\n",
+	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
 		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
-		 VA_BITS, mm->pgd);
+		 vabits_actual, mm_to_pgd_phys(mm));
 	pgdp = pgd_offset(mm, addr);
 	pgd = READ_ONCE(*pgdp);
 	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

 	do {
+		p4d_t *p4dp, p4d;
 		pud_t *pudp, pud;
 		pmd_t *pmdp, pmd;
 		pte_t *ptep, pte;
@@ -179,7 +157,13 @@
 		if (pgd_none(pgd) || pgd_bad(pgd))
 			break;

-		pudp = pud_offset(pgdp, addr);
+		p4dp = p4d_offset(pgdp, addr);
+		p4d = READ_ONCE(*p4dp);
+		pr_cont(", p4d=%016llx", p4d_val(p4d));
+		if (p4d_none(p4d) || p4d_bad(p4d))
+			break;
+
+		pudp = pud_offset(p4dp, addr);
 		pud = READ_ONCE(*pudp);
 		pr_cont(", pud=%016llx", pud_val(pud));
 		if (pud_none(pud) || pud_bad(pud))
@@ -239,7 +223,9 @@
 		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
 	} while (pteval != old_pteval);

-	flush_tlb_fix_spurious_fault(vma, address);
+	/* Invalidate a stale read-only entry */
+	if (dirty)
+		flush_tlb_page(vma, address);
 	return 1;
 }

@@ -248,9 +234,8 @@
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
 }

-static inline bool is_el1_permission_fault(unsigned int esr,
-					   struct pt_regs *regs,
-					   unsigned long addr)
+static inline bool is_el1_permission_fault(unsigned long addr, unsigned int esr,
+					   struct pt_regs *regs)
 {
 	unsigned int ec = ESR_ELx_EC(esr);
 	unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE;
@@ -268,6 +253,38 @@
 	return false;
 }

+static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
+							 unsigned int esr,
+							 struct pt_regs *regs)
+{
+	unsigned long flags;
+	u64 par, dfsc;
+
+	if (ESR_ELx_EC(esr) != ESR_ELx_EC_DABT_CUR ||
+	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
+		return false;
+
+	local_irq_save(flags);
+	asm volatile("at s1e1r, %0" :: "r" (addr));
+	isb();
+	par = read_sysreg_par();
+	local_irq_restore(flags);
+
+	/*
+	 * If we now have a valid translation, treat the translation fault as
+	 * spurious.
+	 */
+	if (!(par & SYS_PAR_EL1_F))
+		return true;
+
+	/*
+	 * If we got a different type of fault from the AT instruction,
+	 * treat the translation fault as spurious.
+	 */
+	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
+	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
+}
+
 static void die_kernel_fault(const char *msg, unsigned long addr,
 			     unsigned int esr, struct pt_regs *regs)
 {
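The new is_spurious_el1_translation_fault() helper re-walks the faulting address with AT S1E1R and then inspects PAR_EL1: if the F bit is clear the translation now succeeds, and if F is set but the walk failed for something other than a translation fault, the original abort is also treated as stale. A standalone sketch of that decision, assuming the architectural PAR_EL1 layout (F in bit 0, FST in bits [6:1]); the constants and sample values below are illustrative, not the kernel's definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAR_F		(1ull << 0)	/* translation failed */
#define PAR_FST_SHIFT	1		/* fault status code, bits [6:1] */
#define PAR_FST_MASK	0x3full
#define FSC_TYPE_MASK	0x3cu
#define FSC_FAULT	0x04u		/* translation fault */

static bool fault_is_spurious(uint64_t par)
{
	unsigned int dfsc;

	/* The AT walk succeeded: the original translation fault was stale. */
	if (!(par & PAR_F))
		return true;

	/* The AT walk failed for a different reason: also treat as spurious. */
	dfsc = (par >> PAR_FST_SHIFT) & PAR_FST_MASK;
	return (dfsc & FSC_TYPE_MASK) != FSC_FAULT;
}

int main(void)
{
	printf("%d\n", fault_is_spurious(0x0));			/* valid -> spurious */
	printf("%d\n", fault_is_spurious(0x1 | (0x07 << 1)));	/* level-3 translation fault -> genuine */
	return 0;
}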
@@ -276,12 +293,77 @@
 	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
 		 addr);

+	trace_android_rvh_die_kernel_fault(regs, esr, addr, msg);
 	mem_abort_decode(esr);

 	show_pte(addr);
 	die("Oops", regs, esr);
 	bust_spinlocks(0);
-	do_exit(SIGKILL);
+	make_task_dead(SIGKILL);
+}
+
+#ifdef CONFIG_KASAN_HW_TAGS
+static void report_tag_fault(unsigned long addr, unsigned int esr,
+			     struct pt_regs *regs)
+{
+	static bool reported;
+	bool is_write;
+
+	if (READ_ONCE(reported))
+		return;
+
+	/*
+	 * This is used for KASAN tests and assumes that no MTE faults
+	 * happened before running the tests.
+	 */
+	if (mte_report_once())
+		WRITE_ONCE(reported, true);
+
+	/*
+	 * SAS bits aren't set for all faults reported in EL1, so we can't
+	 * find out access size.
+	 */
+	is_write = !!(esr & ESR_ELx_WNR);
+	kasan_report(addr, 0, is_write, regs->pc);
+}
+#else
+/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
+static inline void report_tag_fault(unsigned long addr, unsigned int esr,
+				    struct pt_regs *regs) { }
+#endif
+
+static void do_tag_recovery(unsigned long addr, unsigned int esr,
+			   struct pt_regs *regs)
+{
+
+	report_tag_fault(addr, esr, regs);
+
+	/*
+	 * Disable MTE Tag Checking on the local CPU for the current EL.
+	 * It will be done lazily on the other CPUs when they will hit a
+	 * tag fault.
+	 */
+	sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
+	isb();
+}
+
+static bool is_el1_mte_sync_tag_check_fault(unsigned int esr)
+{
+	unsigned int ec = ESR_ELx_EC(esr);
+	unsigned int fsc = esr & ESR_ELx_FSC;
+
+	if (ec != ESR_ELx_EC_DABT_CUR)
+		return false;
+
+	if (fsc == ESR_ELx_FSC_MTE)
+		return true;
+
+	return false;
+}
+
+static bool is_translation_fault(unsigned long esr)
+{
+	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
 }

 static void __do_kernel_fault(unsigned long addr, unsigned int esr,
@@ -296,23 +378,39 @@
 	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
 		return;

-	if (is_el1_permission_fault(esr, regs, addr)) {
+	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
+			   "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
+		return;
+
+	if (is_el1_mte_sync_tag_check_fault(esr)) {
+		do_tag_recovery(addr, esr, regs);
+
+		return;
+	}
+
+	if (is_el1_permission_fault(addr, esr, regs)) {
 		if (esr & ESR_ELx_WNR)
 			msg = "write to read-only memory";
+		else if (is_el1_instruction_abort(esr))
+			msg = "execute from non-executable memory";
 		else
 			msg = "read from unreadable memory";
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
+		if (is_translation_fault(esr) &&
+		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+			return;
+
 		msg = "paging request";
 	}

 	die_kernel_fault(msg, addr, esr, regs);
 }

-static void __do_user_fault(struct siginfo *info, unsigned int esr)
+static void set_thread_esr(unsigned long address, unsigned int esr)
 {
-	current->thread.fault_address = (unsigned long)info->si_addr;
+	current->thread.fault_address = address;

 	/*
 	 * If the faulting address is in the kernel, we must sanitize the ESR.
@@ -365,68 +463,56 @@
 	}

 	current->thread.fault_code = esr;
-	arm64_force_sig_info(info, esr_to_fault_info(esr)->name, current);
 }

-static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static void do_bad_area(unsigned long far, unsigned int esr,
+			struct pt_regs *regs)
 {
+	unsigned long addr = untagged_addr(far);
+
 	/*
 	 * If we are in kernel mode at this point, we have no context to
 	 * handle this fault with.
 	 */
 	if (user_mode(regs)) {
 		const struct fault_info *inf = esr_to_fault_info(esr);
-		struct siginfo si;

-		clear_siginfo(&si);
-		si.si_signo = inf->sig;
-		si.si_code = inf->code;
-		si.si_addr = (void __user *)addr;
-
-		__do_user_fault(&si, esr);
+		set_thread_esr(addr, esr);
+		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
 	} else {
 		__do_kernel_fault(addr, esr, regs);
 	}
 }

-#define VM_FAULT_BADMAP		0x010000
-#define VM_FAULT_BADACCESS	0x020000
+#define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
+#define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)

-static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
-			   unsigned int mm_flags, unsigned long vm_flags,
-			   struct task_struct *tsk)
+static int __do_page_fault(struct vm_area_struct *vma, unsigned long addr,
+			   unsigned int mm_flags, unsigned long vm_flags,
+			   struct pt_regs *regs)
 {
-	struct vm_area_struct *vma;
-	vm_fault_t fault;

-	vma = find_vma(mm, addr);
-	fault = VM_FAULT_BADMAP;
 	if (unlikely(!vma))
-		goto out;
-	if (unlikely(vma->vm_start > addr))
-		goto check_stack;
+		return VM_FAULT_BADMAP;

 	/*
 	 * Ok, we have a good vm_area for this memory access, so we can handle
 	 * it.
 	 */
-good_area:
+	if (unlikely(vma->vm_start > addr)) {
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			return VM_FAULT_BADMAP;
+		if (expand_stack(vma, addr))
+			return VM_FAULT_BADMAP;
+	}
+
 	/*
 	 * Check that the permissions on the VMA allow for the fault which
 	 * occurred.
 	 */
-	if (!(vma->vm_flags & vm_flags)) {
-		fault = VM_FAULT_BADACCESS;
-		goto out;
-	}
-
-	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
-
-check_stack:
-	if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
-		goto good_area;
-out:
-	return fault;
+	if (!(vma->vm_flags & vm_flags))
+		return VM_FAULT_BADACCESS;
+	return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags, regs);
 }

 static bool is_el0_instruction_abort(unsigned int esr)
@@ -434,21 +520,28 @@
 	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
 }

-static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
+/*
+ * Note: not valid for EL1 DC IVAC, but we never use that such that it
+ * should fault. EL0 cannot issue DC IVAC (undef).
+ */
+static bool is_write_abort(unsigned int esr)
+{
+	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
+}
+
+static int __kprobes do_page_fault(unsigned long far, unsigned int esr,
 				   struct pt_regs *regs)
 {
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	struct siginfo si;
-	vm_fault_t fault, major = 0;
-	unsigned long vm_flags = VM_READ | VM_WRITE | VM_EXEC;
-	unsigned int mm_flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	const struct fault_info *inf;
+	struct mm_struct *mm = current->mm;
+	vm_fault_t fault;
+	unsigned long vm_flags = VM_ACCESS_FLAGS;
+	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
+	struct vm_area_struct *vma = NULL;
+	unsigned long addr = untagged_addr(far);

-	if (notify_page_fault(regs, esr))
+	if (kprobe_page_fault(regs, esr))
 		return 0;
-
-	tsk = current;
-	mm = tsk->mm;

 	/*
 	 * If we're in an interrupt or have no user context, we must not take
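do_page_fault() now takes the raw FAR, strips the pointer tag with untagged_addr() before using it as a lookup address, and classifies write aborts with the new is_write_abort() helper, which ignores cache-maintenance data aborts even though they report WnR. A minimal userspace sketch of that classification, assuming WnR at ISS bit 6 and CM at bit 8 (mirroring the ESR_ELx layout); the macro names and sample values are invented:

#include <stdbool.h>
#include <stdio.h>

#define ESR_WNR	(1u << 6)	/* write, not read */
#define ESR_CM	(1u << 8)	/* cache maintenance operation */

static bool is_write_abort(unsigned int esr)
{
	/* DC ZVA and friends set WnR but must not be treated as writes. */
	return (esr & ESR_WNR) && !(esr & ESR_CM);
}

int main(void)
{
	printf("plain store:       %d\n", is_write_abort(ESR_WNR));
	printf("cache maintenance: %d\n", is_write_abort(ESR_WNR | ESR_CM));
	return 0;
}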
@@ -462,12 +555,13 @@

 	if (is_el0_instruction_abort(esr)) {
 		vm_flags = VM_EXEC;
-	} else if ((esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM)) {
+		mm_flags |= FAULT_FLAG_INSTRUCTION;
+	} else if (is_write_abort(esr)) {
 		vm_flags = VM_WRITE;
 		mm_flags |= FAULT_FLAG_WRITE;
 	}

-	if (is_ttbr0_addr(addr) && is_el1_permission_fault(esr, regs, addr)) {
+	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
 		/* regs->orig_addr_limit may be 0 if we entered from EL0 */
 		if (regs->orig_addr_limit == KERNEL_DS)
 			die_kernel_fault("access to user memory with fs=KERNEL_DS",
@@ -485,15 +579,23 @@
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

 	/*
+	 * let's try a speculative page fault without grabbing the
+	 * mmap_sem.
+	 */
+	fault = handle_speculative_fault(mm, addr, mm_flags, &vma, regs);
+	if (fault != VM_FAULT_RETRY)
+		goto done;
+
+	/*
 	 * As per x86, we may deadlock here. However, since the kernel only
 	 * validly references user space from well defined areas of the code,
 	 * we can bug out early if this is from code which shouldn't.
 	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
+	if (!mmap_read_trylock(mm)) {
 		if (!user_mode(regs) && !search_exception_tables(regs->pc))
 			goto no_context;
 retry:
-		down_read(&mm->mmap_sem);
+		mmap_read_lock(mm);
 	} else {
 		/*
 		 * The above down_read_trylock() might have succeeded in which
@@ -501,62 +603,47 @@
 		 */
 		might_sleep();
 #ifdef CONFIG_DEBUG_VM
-		if (!user_mode(regs) && !search_exception_tables(regs->pc))
+		if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
+			mmap_read_unlock(mm);
 			goto no_context;
+		}
 #endif
 	}

-	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, tsk);
-	major |= fault & VM_FAULT_MAJOR;
+	if (!vma || !can_reuse_spf_vma(vma, addr))
+		vma = find_vma(mm, addr);
+	fault = __do_page_fault(vma, addr, mm_flags, vm_flags, regs);
+
+	/* Quick path to respond to signals */
+	if (fault_signal_pending(fault, regs)) {
+		if (!user_mode(regs))
+			goto no_context;
+		return 0;
+	}

 	if (fault & VM_FAULT_RETRY) {
-		/*
-		 * If we need to retry but a fatal signal is pending,
-		 * handle the signal first. We do not need to release
-		 * the mmap_sem because it would already be released
-		 * in __lock_page_or_retry in mm/filemap.c.
-		 */
-		if (fatal_signal_pending(current)) {
-			if (!user_mode(regs))
-				goto no_context;
-			return 0;
-		}
-
-		/*
-		 * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk of
-		 * starvation.
-		 */
 		if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
-			mm_flags &= ~FAULT_FLAG_ALLOW_RETRY;
 			mm_flags |= FAULT_FLAG_TRIED;
+
+			/*
+			 * Do not try to reuse this vma and fetch it
+			 * again since we will release the mmap_sem.
+			 */
+			vma = NULL;
+
 			goto retry;
 		}
 	}
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
+
+done:

 	/*
 	 * Handle the "normal" (no error) case first.
 	 */
 	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
-			      VM_FAULT_BADACCESS)))) {
-		/*
-		 * Major/minor page fault accounting is only done
-		 * once. If we go through a retry, it is extremely
-		 * likely that the page will be found in page cache at
-		 * that point.
-		 */
-		if (major) {
-			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs,
-				      addr);
-		} else {
-			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs,
-				      addr);
-		}
-
+			      VM_FAULT_BADACCESS))))
 		return 0;
-	}

 	/*
 	 * If we are in kernel mode at this point, we have no context to
@@ -575,37 +662,32 @@
 		return 0;
 	}

-	clear_siginfo(&si);
-	si.si_addr = (void __user *)addr;
-
+	inf = esr_to_fault_info(esr);
+	set_thread_esr(addr, esr);
 	if (fault & VM_FAULT_SIGBUS) {
 		/*
 		 * We had some memory, but were unable to successfully fix up
 		 * this page fault.
 		 */
-		si.si_signo = SIGBUS;
-		si.si_code = BUS_ADRERR;
-	} else if (fault & VM_FAULT_HWPOISON_LARGE) {
-		unsigned int hindex = VM_FAULT_GET_HINDEX(fault);
+		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
+	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
+		unsigned int lsb;

-		si.si_signo = SIGBUS;
-		si.si_code = BUS_MCEERR_AR;
-		si.si_addr_lsb = hstate_index_to_shift(hindex);
-	} else if (fault & VM_FAULT_HWPOISON) {
-		si.si_signo = SIGBUS;
-		si.si_code = BUS_MCEERR_AR;
-		si.si_addr_lsb = PAGE_SHIFT;
+		lsb = PAGE_SHIFT;
+		if (fault & VM_FAULT_HWPOISON_LARGE)
+			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+
+		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
 	} else {
 		/*
 		 * Something tried to access memory that isn't in our memory
 		 * map.
 		 */
-		si.si_signo = SIGSEGV;
-		si.si_code = fault == VM_FAULT_BADACCESS ?
-			SEGV_ACCERR : SEGV_MAPERR;
+		arm64_force_sig_fault(SIGSEGV,
+				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
+				      far, inf->name);
 	}

-	__do_user_fault(&si, esr);
 	return 0;

 no_context:
@@ -613,81 +695,84 @@
 	return 0;
 }

-int __weak do_tlb_conf_fault(unsigned long addr,
-				unsigned int esr,
-				struct pt_regs *regs)
-{
-	return 1; /* do_bad default */
-}
-
-int (*do_tlb_conf_fault_cb)(unsigned long addr,
-				unsigned int esr,
-				struct pt_regs *regs)
-				= do_tlb_conf_fault; /* initialization saves us a branch */
-EXPORT_SYMBOL_GPL(do_tlb_conf_fault_cb);
-
-static int _do_tlb_conf_fault(unsigned long addr,
-				unsigned int esr,
-				struct pt_regs *regs)
-{
-	return (*do_tlb_conf_fault_cb)(addr, esr, regs);
-}
-
-static int __kprobes do_translation_fault(unsigned long addr,
+static int __kprobes do_translation_fault(unsigned long far,
 					  unsigned int esr,
 					  struct pt_regs *regs)
 {
-	if (is_ttbr0_addr(addr))
-		return do_page_fault(addr, esr, regs);
+	unsigned long addr = untagged_addr(far);

-	do_bad_area(addr, esr, regs);
+	if (is_ttbr0_addr(addr))
+		return do_page_fault(far, esr, regs);
+
+	do_bad_area(far, esr, regs);
 	return 0;
 }

-static int do_alignment_fault(unsigned long addr, unsigned int esr,
+#ifdef CONFIG_ROCKCHIP_ARM64_ALIGN_FAULT_FIX
+extern int alignment_fixup_helper(unsigned long addr, unsigned int esr,
+				  struct pt_regs *regs);
+#endif
+static int do_alignment_fault(unsigned long far, unsigned int esr,
 			      struct pt_regs *regs)
 {
-	do_bad_area(addr, esr, regs);
+#ifdef CONFIG_ROCKCHIP_ARM64_ALIGN_FAULT_FIX
+	if (!alignment_fixup_helper(far, esr, regs))
+		return 0;
+#endif
+	do_bad_area(far, esr, regs);
 	return 0;
 }

-static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
-	return 1; /* "fault" */
+	unsigned long addr = untagged_addr(far);
+	int ret = 1;
+
+	trace_android_vh_handle_tlb_conf(addr, esr, &ret);
+	return ret;
 }

-static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
+static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
-	struct siginfo info;
 	const struct fault_info *inf;
+	unsigned long siaddr;

 	inf = esr_to_fault_info(esr);

-	/*
-	 * Synchronous aborts may interrupt code which had interrupts masked.
-	 * Before calling out into the wider kernel tell the interested
-	 * subsystems.
-	 */
-	if (IS_ENABLED(CONFIG_ACPI_APEI_SEA)) {
-		if (interrupts_enabled(regs))
-			nmi_enter();
-
-		ghes_notify_sea();
-
-		if (interrupts_enabled(regs))
-			nmi_exit();
+	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
+		/*
+		 * APEI claimed this as a firmware-first notification.
+		 * Some processing deferred to task_work before ret_to_user().
+		 */
+		return 0;
 	}

-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code = inf->code;
-	if (esr & ESR_ELx_FnV)
-		info.si_addr = NULL;
-	else
-		info.si_addr = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+	if (esr & ESR_ELx_FnV) {
+		siaddr = 0;
+	} else {
+		/*
+		 * The architecture specifies that the tag bits of FAR_EL1 are
+		 * UNKNOWN for synchronous external aborts. Mask them out now
+		 * so that userspace doesn't see them.
+		 */
+		siaddr = untagged_addr(far);
+	}
+	trace_android_rvh_do_sea(regs, esr, siaddr, inf->name);
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

+	return 0;
+}
+
+static int do_tag_check_fault(unsigned long far, unsigned int esr,
+			      struct pt_regs *regs)
+{
+	/*
+	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
+	 * for tag check faults. Set them to corresponding bits in the untagged
+	 * address.
+	 */
+	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
+	do_bad_area(far, esr, regs);
 	return 0;
 }

@@ -709,7 +794,7 @@
 	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
 	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
 	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
-	{ do_bad, SIGKILL, SI_KERNEL, "unknown 17" },
+	{ do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 19" },
 	{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
@@ -740,7 +825,7 @@
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 45" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 46" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 47" },
-	{ _do_tlb_conf_fault, SIGKILL, SI_KERNEL, "TLB conflict abort" },
+	{ do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" },
 	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
@@ -758,76 +843,45 @@
 	{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
 };

-int handle_guest_sea(phys_addr_t addr, unsigned int esr)
-{
-	return ghes_notify_sea();
-}
-
-asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
-					 struct pt_regs *regs)
+void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 {
 	const struct fault_info *inf = esr_to_fault_info(esr);
-	struct siginfo info;
+	unsigned long addr = untagged_addr(far);

-	if (!inf->fn(addr, esr, regs))
+	if (!inf->fn(far, esr, regs))
 		return;

 	if (!user_mode(regs)) {
 		pr_alert("Unhandled fault at 0x%016lx\n", addr);
+		trace_android_rvh_do_mem_abort(regs, esr, addr, inf->name);
 		mem_abort_decode(esr);
 		show_pte(addr);
 	}

-	clear_siginfo(&info);
-	info.si_signo = inf->sig;
-	info.si_errno = 0;
-	info.si_code = inf->code;
-	info.si_addr = (void __user *)addr;
-	arm64_notify_die(inf->name, regs, &info, esr);
+	/*
+	 * At this point we have an unrecognized fault type whose tag bits may
+	 * have been defined as UNKNOWN. Therefore we only expose the untagged
+	 * address to the signal handler.
+	 */
+	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
 }
+NOKPROBE_SYMBOL(do_mem_abort);

-asmlinkage void __exception do_el0_irq_bp_hardening(void)
+void do_el0_irq_bp_hardening(void)
 {
 	/* PC has already been checked in entry.S */
 	arm64_apply_bp_hardening();
 }
+NOKPROBE_SYMBOL(do_el0_irq_bp_hardening);

-asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
-						   unsigned int esr,
-						   struct pt_regs *regs)
+void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
-	/*
-	 * We've taken an instruction abort from userspace and not yet
-	 * re-enabled IRQs. If the address is a kernel address, apply
-	 * BP hardening prior to enabling IRQs and pre-emption.
-	 */
-	if (!is_ttbr0_addr(addr))
-		arm64_apply_bp_hardening();
+	trace_android_rvh_do_sp_pc_abort(regs, esr, addr, user_mode(regs));

-	local_irq_enable();
-	do_mem_abort(addr, esr, regs);
+	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
+			 addr, esr);
 }
-
-
-asmlinkage void __exception do_sp_pc_abort(unsigned long addr,
-					   unsigned int esr,
-					   struct pt_regs *regs)
-{
-	struct siginfo info;
-
-	if (user_mode(regs)) {
-		if (!is_ttbr0_addr(instruction_pointer(regs)))
-			arm64_apply_bp_hardening();
-		local_irq_enable();
-	}
-
-	clear_siginfo(&info);
-	info.si_signo = SIGBUS;
-	info.si_errno = 0;
-	info.si_code = BUS_ADRALN;
-	info.si_addr = (void __user *)addr;
-	arm64_notify_die("SP/PC alignment exception", regs, &info, esr);
-}
+NOKPROBE_SYMBOL(do_sp_pc_abort);

 int __init early_brk64(unsigned long addr, unsigned int esr,
 		       struct pt_regs *regs);
@@ -860,11 +914,32 @@
 	debug_fault_info[nr].name = name;
 }

+/*
+ * In debug exception context, we explicitly disable preemption despite
+ * having interrupts disabled.
+ * This serves two purposes: it makes it much less likely that we would
+ * accidentally schedule in exception context and it will force a warning
+ * if we somehow manage to schedule by accident.
+ */
+static void debug_exception_enter(struct pt_regs *regs)
+{
+	preempt_disable();
+
+	/* This code is a bit fragile. Test it. */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
+}
+NOKPROBE_SYMBOL(debug_exception_enter);
+
+static void debug_exception_exit(struct pt_regs *regs)
+{
+	preempt_enable_no_resched();
+}
+NOKPROBE_SYMBOL(debug_exception_exit);
+
 #ifdef CONFIG_ARM64_ERRATUM_1463225
 DECLARE_PER_CPU(int, __in_cortex_a76_erratum_1463225_wa);

-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
 {
 	if (user_mode(regs))
 		return 0;
@@ -883,65 +958,57 @@
 	return 1;
 }
 #else
-static int __exception
-cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
+static int cortex_a76_erratum_1463225_debug_handler(struct pt_regs *regs)
 {
 	return 0;
 }
 #endif /* CONFIG_ARM64_ERRATUM_1463225 */
+NOKPROBE_SYMBOL(cortex_a76_erratum_1463225_debug_handler);

-asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint,
-					      unsigned int esr,
-					      struct pt_regs *regs)
+void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr,
+			struct pt_regs *regs)
 {
-	const struct fault_info *inf = debug_fault_info + DBG_ESR_EVT(esr);
+	const struct fault_info *inf = esr_to_debug_fault_info(esr);
 	unsigned long pc = instruction_pointer(regs);
-	int rv;

 	if (cortex_a76_erratum_1463225_debug_handler(regs))
-		return 0;
+		return;

-	/*
-	 * Tell lockdep we disabled irqs in entry.S. Do nothing if they were
-	 * already disabled to preserve the last enabled/disabled addresses.
-	 */
-	if (interrupts_enabled(regs))
-		trace_hardirqs_off();
+	debug_exception_enter(regs);

 	if (user_mode(regs) && !is_ttbr0_addr(pc))
 		arm64_apply_bp_hardening();

-	if (!inf->fn(addr_if_watchpoint, esr, regs)) {
-		rv = 1;
-	} else {
-		struct siginfo info;
-
-		clear_siginfo(&info);
-		info.si_signo = inf->sig;
-		info.si_errno = 0;
-		info.si_code = inf->code;
-		info.si_addr = (void __user *)pc;
-		arm64_notify_die(inf->name, regs, &info, esr);
-		rv = 0;
+	if (inf->fn(addr_if_watchpoint, esr, regs)) {
+		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
 	}

-	if (interrupts_enabled(regs))
-		trace_hardirqs_on();
-
-	return rv;
+	debug_exception_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug_exception);

-#ifdef CONFIG_ARM64_PAN
-void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
+/*
+ * Used during anonymous page fault handling.
+ */
+struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
+						unsigned long vaddr)
 {
-	/*
-	 * We modify PSTATE. This won't work from irq context as the PSTATE
-	 * is discarded once we return from the exception.
-	 */
-	WARN_ON_ONCE(in_interrupt());
+	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO | __GFP_CMA;

-	sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0);
-	asm(SET_PSTATE_PAN(1));
+	/*
+	 * If the page is mapped with PROT_MTE, initialise the tags at the
+	 * point of allocation and page zeroing as this is usually faster than
+	 * separate DC ZVA and STGM.
+	 */
+	if (vma->vm_flags & VM_MTE)
+		flags |= __GFP_ZEROTAGS;
+
+	return alloc_page_vma(flags, vma, vaddr);
 }
-#endif /* CONFIG_ARM64_PAN */
+
+void tag_clear_highpage(struct page *page)
+{
+	mte_zero_clear_page_tags(page_address(page));
+	page_kasan_tag_reset(page);
+	set_bit(PG_mte_tagged, &page->flags);
+}
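For tag check faults, the do_tag_check_fault() handler added above rebuilds FAR bits 63:60 (architecturally UNKNOWN) from the untagged address while preserving the MTE logical tag in bits 59:56. A small userspace sketch of that recombination, assuming those bit positions match the kernel's MTE_TAG_MASK and a sign-extend-from-bit-55 untagging rule; the helper name and sample value are invented:

#include <stdint.h>
#include <stdio.h>

#define MTE_TAG_MASK	(0xfull << 56)	/* logical tag, bits 59:56 */

/* Sign-extend from bit 55, as untagged_addr() does for user VAs. */
static uint64_t untag(uint64_t addr)
{
	return (uint64_t)((int64_t)(addr << 8) >> 8);
}

int main(void)
{
	/* hypothetical FAR: junk 0x5 in bits 63:60, tag 0xb in bits 59:56 */
	uint64_t far = 0x5b00ffff00001000ull;
	uint64_t fixed = (untag(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);

	printf("reported FAR = 0x%016llx\n", (unsigned long long)far);
	printf("rebuilt  FAR = 0x%016llx\n", (unsigned long long)fixed);
	return 0;
}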