forked from ~ljy/RK356X_SDK_RELEASE

hc
2024-09-20 cf4ce59b3b70238352c7f1729f0f7223214828ad
kernel/arch/powerpc/kernel/mce.c
....@@ -1,19 +1,6 @@
1
+// SPDX-License-Identifier: GPL-2.0-or-later
12 /*
23 * Machine check exception handling.
3
- *
4
- * This program is free software; you can redistribute it and/or modify
5
- * it under the terms of the GNU General Public License as published by
6
- * the Free Software Foundation; either version 2 of the License, or
7
- * (at your option) any later version.
8
- *
9
- * This program is distributed in the hope that it will be useful,
10
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
11
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
- * GNU General Public License for more details.
13
- *
14
- * You should have received a copy of the GNU General Public License
15
- * along with this program; if not, write to the Free Software
16
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
174 *
185 * Copyright 2013 IBM Corporation
196 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
....@@ -28,9 +15,12 @@
2815 #include <linux/percpu.h>
2916 #include <linux/export.h>
3017 #include <linux/irq_work.h>
18
+#include <linux/extable.h>
19
+#include <linux/ftrace.h>
3120
3221 #include <asm/machdep.h>
3322 #include <asm/mce.h>
23
+#include <asm/nmi.h>
3424
3525 static DEFINE_PER_CPU(int, mce_nest_count);
3626 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
....@@ -46,7 +36,7 @@
4636
4737 static void machine_check_process_queued_event(struct irq_work *work);
4838 static void machine_check_ue_irq_work(struct irq_work *work);
49
-void machine_check_ue_event(struct machine_check_event *evt);
39
+static void machine_check_ue_event(struct machine_check_event *evt);
5040 static void machine_process_ue_event(struct work_struct *work);
5141
5242 static struct irq_work mce_event_process_work = {
....@@ -58,6 +48,20 @@
5848 };
5949
6050 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
51
+
52
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
53
+
54
+int mce_register_notifier(struct notifier_block *nb)
55
+{
56
+ return blocking_notifier_chain_register(&mce_notifier_list, nb);
57
+}
58
+EXPORT_SYMBOL_GPL(mce_register_notifier);
59
+
60
+int mce_unregister_notifier(struct notifier_block *nb)
61
+{
62
+ return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
63
+}
64
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
6165
6266 static void mce_set_error_info(struct machine_check_event *mce,
6367 struct mce_error_info *mce_err)
....@@ -116,6 +120,7 @@
116120 mce->srr1 = regs->msr;
117121 mce->gpr3 = regs->gpr[3];
118122 mce->in_use = 1;
123
+ mce->cpu = get_paca()->paca_index;
119124
120125 /* Mark it recovered if we have handled it and MSR(RI=1). */
121126 if (handled && (regs->msr & MSR_RI))
....@@ -125,6 +130,8 @@
125130
126131 mce->initiator = mce_err->initiator;
127132 mce->severity = mce_err->severity;
133
+ mce->sync_error = mce_err->sync_error;
134
+ mce->error_class = mce_err->error_class;
128135
129136 /*
130137 * Populate the mce error_type and type-specific error_type.
....@@ -158,6 +165,7 @@
158165 if (phys_addr != ULONG_MAX) {
159166 mce->u.ue_error.physical_address_provided = true;
160167 mce->u.ue_error.physical_address = phys_addr;
168
+ mce->u.ue_error.ignore_event = mce_err->ignore_event;
161169 machine_check_ue_event(mce);
162170 }
163171 }
....@@ -221,7 +229,7 @@
221229 /*
222230 * Queue up the MCE event which then can be handled later.
223231 */
224
-void machine_check_ue_event(struct machine_check_event *evt)
232
+static void machine_check_ue_event(struct machine_check_event *evt)
225233 {
226234 int index;
227235
....@@ -259,6 +267,19 @@
259267 /* Queue irq work to process this event later. */
260268 irq_work_queue(&mce_event_process_work);
261269 }
270
+
271
+void mce_common_process_ue(struct pt_regs *regs,
272
+ struct mce_error_info *mce_err)
273
+{
274
+ const struct exception_table_entry *entry;
275
+
276
+ entry = search_kernel_exception_table(regs->nip);
277
+ if (entry) {
278
+ mce_err->ignore_event = true;
279
+ regs->nip = extable_fixup(entry);
280
+ }
281
+}
282
+
262283 /*
263284 * process pending MCE event from the mce event queue. This function will be
264285 * called during syscall exit.
....@@ -271,12 +292,22 @@
271292 while (__this_cpu_read(mce_ue_count) > 0) {
272293 index = __this_cpu_read(mce_ue_count) - 1;
273294 evt = this_cpu_ptr(&mce_ue_event_queue[index]);
295
+ blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
274296 #ifdef CONFIG_MEMORY_FAILURE
275297 /*
276298 * This should probably be queued elsewhere, but
277299 * oh! well
300
+ *
301
+ * Don't report this machine check because the caller has
302
+ * asked us to ignore the event, it has a fixup handler which
303
+ * will do the appropriate error handling and reporting.
278304 */
279305 if (evt->error_type == MCE_ERROR_TYPE_UE) {
306
+ if (evt->u.ue_error.ignore_event) {
307
+ __this_cpu_dec(mce_ue_count);
308
+ continue;
309
+ }
310
+
280311 if (evt->u.ue_error.physical_address_provided) {
281312 unsigned long pfn;
282313
....@@ -310,15 +341,25 @@
310341 while (__this_cpu_read(mce_queue_count) > 0) {
311342 index = __this_cpu_read(mce_queue_count) - 1;
312343 evt = this_cpu_ptr(&mce_event_queue[index]);
313
- machine_check_print_event_info(evt, false);
344
+
345
+ if (evt->error_type == MCE_ERROR_TYPE_UE &&
346
+ evt->u.ue_error.ignore_event) {
347
+ __this_cpu_dec(mce_queue_count);
348
+ continue;
349
+ }
350
+ machine_check_print_event_info(evt, false, false);
314351 __this_cpu_dec(mce_queue_count);
315352 }
316353 }
317354
318355 void machine_check_print_event_info(struct machine_check_event *evt,
319
- bool user_mode)
356
+ bool user_mode, bool in_guest)
320357 {
321
- const char *level, *sevstr, *subtype;
358
+ const char *level, *sevstr, *subtype, *err_type, *initiator;
359
+ uint64_t ea = 0, pa = 0;
360
+ int n = 0;
361
+ char dar_str[50];
362
+ char pa_str[50];
322363 static const char *mc_ue_types[] = {
323364 "Indeterminate",
324365 "Instruction fetch",
....@@ -344,6 +385,7 @@
344385 static const char *mc_user_types[] = {
345386 "Indeterminate",
346387 "tlbie(l) invalid",
388
+ "scv invalid",
347389 };
348390 static const char *mc_ra_types[] = {
349391 "Indeterminate",
....@@ -365,6 +407,13 @@
365407 "Store (timeout)",
366408 "Page table walk Load/Store (timeout)",
367409 };
410
+ static const char *mc_error_class[] = {
411
+ "Unknown",
412
+ "Hardware error",
413
+ "Probable Hardware error (some chance of software cause)",
414
+ "Software error",
415
+ "Probable Software error (some chance of hardware cause)",
416
+ };
368417
369418 /* Print things out */
370419 if (evt->version != MCE_V1) {
....@@ -379,9 +428,9 @@
379428 break;
380429 case MCE_SEV_WARNING:
381430 level = KERN_WARNING;
382
- sevstr = "";
431
+ sevstr = "Warning";
383432 break;
384
- case MCE_SEV_ERROR_SYNC:
433
+ case MCE_SEV_SEVERE:
385434 level = KERN_ERR;
386435 sevstr = "Severe";
387436 break;
....@@ -392,99 +441,145 @@
392441 break;
393442 }
394443
395
- printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
396
- evt->disposition == MCE_DISPOSITION_RECOVERED ?
397
- "Recovered" : "Not recovered");
398
-
399
- if (user_mode) {
400
- printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level,
401
- evt->srr0, current->pid, current->comm);
402
- } else {
403
- printk("%s NIP [%016llx]: %pS\n", level, evt->srr0,
404
- (void *)evt->srr0);
444
+ switch(evt->initiator) {
445
+ case MCE_INITIATOR_CPU:
446
+ initiator = "CPU";
447
+ break;
448
+ case MCE_INITIATOR_PCI:
449
+ initiator = "PCI";
450
+ break;
451
+ case MCE_INITIATOR_ISA:
452
+ initiator = "ISA";
453
+ break;
454
+ case MCE_INITIATOR_MEMORY:
455
+ initiator = "Memory";
456
+ break;
457
+ case MCE_INITIATOR_POWERMGM:
458
+ initiator = "Power Management";
459
+ break;
460
+ case MCE_INITIATOR_UNKNOWN:
461
+ default:
462
+ initiator = "Unknown";
463
+ break;
405464 }
406465
407
- printk("%s Initiator: %s\n", level,
408
- evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
409466 switch (evt->error_type) {
410467 case MCE_ERROR_TYPE_UE:
468
+ err_type = "UE";
411469 subtype = evt->u.ue_error.ue_error_type <
412470 ARRAY_SIZE(mc_ue_types) ?
413471 mc_ue_types[evt->u.ue_error.ue_error_type]
414472 : "Unknown";
415
- printk("%s Error type: UE [%s]\n", level, subtype);
416473 if (evt->u.ue_error.effective_address_provided)
417
- printk("%s Effective address: %016llx\n",
418
- level, evt->u.ue_error.effective_address);
474
+ ea = evt->u.ue_error.effective_address;
419475 if (evt->u.ue_error.physical_address_provided)
420
- printk("%s Physical address: %016llx\n",
421
- level, evt->u.ue_error.physical_address);
476
+ pa = evt->u.ue_error.physical_address;
422477 break;
423478 case MCE_ERROR_TYPE_SLB:
479
+ err_type = "SLB";
424480 subtype = evt->u.slb_error.slb_error_type <
425481 ARRAY_SIZE(mc_slb_types) ?
426482 mc_slb_types[evt->u.slb_error.slb_error_type]
427483 : "Unknown";
428
- printk("%s Error type: SLB [%s]\n", level, subtype);
429484 if (evt->u.slb_error.effective_address_provided)
430
- printk("%s Effective address: %016llx\n",
431
- level, evt->u.slb_error.effective_address);
485
+ ea = evt->u.slb_error.effective_address;
432486 break;
433487 case MCE_ERROR_TYPE_ERAT:
488
+ err_type = "ERAT";
434489 subtype = evt->u.erat_error.erat_error_type <
435490 ARRAY_SIZE(mc_erat_types) ?
436491 mc_erat_types[evt->u.erat_error.erat_error_type]
437492 : "Unknown";
438
- printk("%s Error type: ERAT [%s]\n", level, subtype);
439493 if (evt->u.erat_error.effective_address_provided)
440
- printk("%s Effective address: %016llx\n",
441
- level, evt->u.erat_error.effective_address);
494
+ ea = evt->u.erat_error.effective_address;
442495 break;
443496 case MCE_ERROR_TYPE_TLB:
497
+ err_type = "TLB";
444498 subtype = evt->u.tlb_error.tlb_error_type <
445499 ARRAY_SIZE(mc_tlb_types) ?
446500 mc_tlb_types[evt->u.tlb_error.tlb_error_type]
447501 : "Unknown";
448
- printk("%s Error type: TLB [%s]\n", level, subtype);
449502 if (evt->u.tlb_error.effective_address_provided)
450
- printk("%s Effective address: %016llx\n",
451
- level, evt->u.tlb_error.effective_address);
503
+ ea = evt->u.tlb_error.effective_address;
452504 break;
453505 case MCE_ERROR_TYPE_USER:
506
+ err_type = "User";
454507 subtype = evt->u.user_error.user_error_type <
455508 ARRAY_SIZE(mc_user_types) ?
456509 mc_user_types[evt->u.user_error.user_error_type]
457510 : "Unknown";
458
- printk("%s Error type: User [%s]\n", level, subtype);
459511 if (evt->u.user_error.effective_address_provided)
460
- printk("%s Effective address: %016llx\n",
461
- level, evt->u.user_error.effective_address);
512
+ ea = evt->u.user_error.effective_address;
462513 break;
463514 case MCE_ERROR_TYPE_RA:
515
+ err_type = "Real address";
464516 subtype = evt->u.ra_error.ra_error_type <
465517 ARRAY_SIZE(mc_ra_types) ?
466518 mc_ra_types[evt->u.ra_error.ra_error_type]
467519 : "Unknown";
468
- printk("%s Error type: Real address [%s]\n", level, subtype);
469520 if (evt->u.ra_error.effective_address_provided)
470
- printk("%s Effective address: %016llx\n",
471
- level, evt->u.ra_error.effective_address);
521
+ ea = evt->u.ra_error.effective_address;
472522 break;
473523 case MCE_ERROR_TYPE_LINK:
524
+ err_type = "Link";
474525 subtype = evt->u.link_error.link_error_type <
475526 ARRAY_SIZE(mc_link_types) ?
476527 mc_link_types[evt->u.link_error.link_error_type]
477528 : "Unknown";
478
- printk("%s Error type: Link [%s]\n", level, subtype);
479529 if (evt->u.link_error.effective_address_provided)
480
- printk("%s Effective address: %016llx\n",
481
- level, evt->u.link_error.effective_address);
530
+ ea = evt->u.link_error.effective_address;
531
+ break;
532
+ case MCE_ERROR_TYPE_DCACHE:
533
+ err_type = "D-Cache";
534
+ subtype = "Unknown";
535
+ break;
536
+ case MCE_ERROR_TYPE_ICACHE:
537
+ err_type = "I-Cache";
538
+ subtype = "Unknown";
482539 break;
483540 default:
484541 case MCE_ERROR_TYPE_UNKNOWN:
485
- printk("%s Error type: Unknown\n", level);
542
+ err_type = "Unknown";
543
+ subtype = "";
486544 break;
487545 }
546
+
547
+ dar_str[0] = pa_str[0] = '\0';
548
+ if (ea && evt->srr0 != ea) {
549
+ /* Load/Store address */
550
+ n = sprintf(dar_str, "DAR: %016llx ", ea);
551
+ if (pa)
552
+ sprintf(dar_str + n, "paddr: %016llx ", pa);
553
+ } else if (pa) {
554
+ sprintf(pa_str, " paddr: %016llx", pa);
555
+ }
556
+
557
+ printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n",
558
+ level, evt->cpu, sevstr, in_guest ? "Guest" : "Host",
559
+ err_type, subtype, dar_str,
560
+ evt->disposition == MCE_DISPOSITION_RECOVERED ?
561
+ "Recovered" : "Not recovered");
562
+
563
+ if (in_guest || user_mode) {
564
+ printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n",
565
+ level, evt->cpu, current->pid, current->comm,
566
+ in_guest ? "Guest " : "", evt->srr0, pa_str);
567
+ } else {
568
+ printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
569
+ level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
570
+ }
571
+
572
+ printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator);
573
+
574
+ subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
575
+ mc_error_class[evt->error_class] : "Unknown";
576
+ printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
577
+
578
+#ifdef CONFIG_PPC_BOOK3S_64
579
+ /* Display faulty slb contents for SLB errors. */
580
+ if (evt->error_type == MCE_ERROR_TYPE_SLB)
581
+ slb_dump_contents(local_paca->mce_faulty_slbs);
582
+#endif
488583 }
489584 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
490585
....@@ -493,14 +588,29 @@
493588 *
494589 * regs->nip and regs->msr contain srr0 and srr1.
495590 */
496
-long machine_check_early(struct pt_regs *regs)
591
+long notrace machine_check_early(struct pt_regs *regs)
497592 {
498593 long handled = 0;
594
+ u8 ftrace_enabled = this_cpu_get_ftrace_enabled();
499595
500
- __this_cpu_inc(irq_stat.mce_exceptions);
596
+ this_cpu_set_ftrace_enabled(0);
597
+ /* Do not use nmi_enter/exit for pseries hpte guest */
598
+ if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
599
+ nmi_enter();
501600
502
- if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
503
- handled = cur_cpu_spec->machine_check_early(regs);
601
+ hv_nmi_check_nonrecoverable(regs);
602
+
603
+ /*
604
+ * See if platform is capable of handling machine check.
605
+ */
606
+ if (ppc_md.machine_check_early)
607
+ handled = ppc_md.machine_check_early(regs);
608
+
609
+ if (radix_enabled() || !firmware_has_feature(FW_FEATURE_LPAR))
610
+ nmi_exit();
611
+
612
+ this_cpu_set_ftrace_enabled(ftrace_enabled);
613
+
504614 return handled;
505615 }
506616
....@@ -616,7 +726,7 @@
616726 {
617727 int ret;
618728
619
- __this_cpu_inc(irq_stat.hmi_exceptions);
729
+ local_paca->hmi_irqs++;
620730
621731 ret = hmi_handle_debugtrig(regs);
622732 if (ret >= 0)