/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
 */
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/utsname.h>
#include <linux/hardirq.h>
#include <linux/irq_pipeline.h>
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/ftrace.h>
#include <linux/kexec.h>
#include <linux/bug.h>
#include <linux/nmi.h>
#include <linux/sysfs.h>
#include <linux/kasan.h>

#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h>
#include <asm/unwind.h>

int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
static int die_counter;

static struct pt_regs exec_summary_regs;

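/*
 * Check whether @stack points into @task's thread stack; if so, record
 * the stack bounds in @info for the unwinder.  noinstr because this can
 * be reached from low-level entry paths where instrumentation is unsafe.
 */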
bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task,
			   struct stack_info *info)
{
	unsigned long *begin = task_stack_page(task);
	unsigned long *end   = task_stack_page(task) + THREAD_SIZE;

	if (stack < begin || stack >= end)
		return false;

	info->type	= STACK_TYPE_TASK;
	info->begin	= begin;
	info->end	= end;
	info->next_sp	= NULL;

	return true;
}

/* Called from get_stack_info_noinstr - so must be noinstr too */
bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info)
{
	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());

	void *begin = ss;
	void *end = ss + 1;

	if ((void *)stack < begin || (void *)stack >= end)
		return false;

	info->type	= STACK_TYPE_ENTRY;
	info->begin	= begin;
	info->end	= end;
	info->next_sp	= NULL;

	return true;
}

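/*
 * Dumping a long trace one line at a time can take a while; poke the NMI
 * watchdog on every line so the hardlockup detector doesn't fire in the
 * middle of it.  %pB is the backtrace variant of the symbol format: it
 * adjusts the address back by one so a return address resolves to the
 * calling line.
 */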
static void printk_stack_address(unsigned long address, int reliable,
				 const char *log_lvl)
{
	touch_nmi_watchdog();
	printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}

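/*
 * Grab @nbytes of opcode bytes around the faulting instruction for the
 * "Code:" line, from kernel or current-task user memory, without ever
 * taking a page fault.
 */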
static int copy_code(struct pt_regs *regs, u8 *buf, unsigned long src,
		     unsigned int nbytes)
{
	if (!user_mode(regs))
		return copy_from_kernel_nofault(buf, (u8 *)src, nbytes);

	/* The user space code from other tasks cannot be accessed. */
	if (regs != task_pt_regs(current))
		return -EPERM;
	/*
	 * Make sure userspace isn't trying to trick us into dumping kernel
	 * memory by pointing the userspace instruction pointer at it.
	 */
	if (__chk_range_not_ok(src, nbytes, TASK_SIZE_MAX))
		return -EINVAL;

	/*
	 * Even though it is named copy_from_user_nmi(), this can be invoked
	 * from other contexts as well; it does not try to resolve a page
	 * fault, which is the correct behavior here since this code can be
	 * called from any context.
	 */
	return copy_from_user_nmi(buf, (void __user *)src, nbytes);
}

/*
 * There are a couple of reasons for the 2/3rds prologue, courtesy of Linus:
 *
 * In the case where we don't have the exact kernel image (if we did, we
 * could simply disassemble it and navigate to the RIP), the purpose of the
 * bigger prologue is to provide more context and to make it easier to
 * correlate the code across different toolchains.
 *
 * In addition, it helps in recreating the register allocation of the
 * failing kernel and thus in making sense of the register dump.
 *
 * What is more, the additional complication of a variable-length
 * instruction architecture like x86 warrants having a longer byte sequence
 * before rIP so that the disassembler can "sync up" properly and find
 * instruction boundaries when decoding the opcode bytes.
 *
 * Thus, the 2/3rds prologue and the 64-byte OPCODE_BUFSIZE are just a
 * guesstimate aimed at achieving all of the above.
 */
void show_opcodes(struct pt_regs *regs, const char *loglvl)
{
#define PROLOGUE_SIZE 42
#define EPILOGUE_SIZE 21
#define OPCODE_BUFSIZE (PROLOGUE_SIZE + 1 + EPILOGUE_SIZE)
	u8 opcodes[OPCODE_BUFSIZE];
	unsigned long prologue = regs->ip - PROLOGUE_SIZE;

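	/*
	 * The buffer holds PROLOGUE_SIZE bytes before regs->ip, the byte at
	 * regs->ip itself (printed between <>), and EPILOGUE_SIZE bytes
	 * after it: 42 + 1 + 21 = 64 bytes total.
	 */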
	switch (copy_code(regs, opcodes, prologue, sizeof(opcodes))) {
	case 0:
		printk("%sCode: %" __stringify(PROLOGUE_SIZE) "ph <%02x> %"
		       __stringify(EPILOGUE_SIZE) "ph\n", loglvl, opcodes,
		       opcodes[PROLOGUE_SIZE], opcodes + PROLOGUE_SIZE + 1);
		break;
	case -EPERM:
		/* No access to the user space stack of other tasks. Ignore. */
		break;
	default:
		printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
		       loglvl, prologue);
		break;
	}
}

void show_ip(struct pt_regs *regs, const char *loglvl)
{
#ifdef CONFIG_X86_32
	printk("%sEIP: %pS\n", loglvl, (void *)regs->ip);
#else
	printk("%sRIP: %04x:%pS\n", loglvl, (int)regs->cs, (void *)regs->ip);
#endif
	show_opcodes(regs, loglvl);
}

void show_iret_regs(struct pt_regs *regs, const char *log_lvl)
{
	show_ip(regs, log_lvl);
	printk("%sRSP: %04x:%016lx EFLAGS: %08lx", log_lvl, (int)regs->ss,
	       regs->sp, regs->flags);
}

static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
				  bool partial, const char *log_lvl)
{
	/*
	 * These on_stack() checks aren't strictly necessary: the unwind code
	 * has already validated the 'regs' pointer.  The checks are done for
	 * ordering reasons: if the registers are on the next stack, we don't
	 * want to print them out yet.  Otherwise they'll be shown as part of
	 * the wrong stack.  Later, when show_trace_log_lvl() switches to the
	 * next stack, this function will be called again with the same regs so
	 * they can be printed in the right context.
	 */
	if (!partial && on_stack(info, regs, sizeof(*regs))) {
		__show_regs(regs, SHOW_REGS_SHORT, log_lvl);

	} else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET,
				       IRET_FRAME_SIZE)) {
		/*
		 * When an interrupt or exception occurs in entry code, the
		 * full pt_regs might not have been saved yet.  In that case
		 * just print the iret frame.
		 */
		show_iret_regs(regs, log_lvl);
	}
}

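/*
 * Print the backtrace for @task (or for the exception context in @regs),
 * walking and labeling every stack the trace crosses.
 */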
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
			unsigned long *stack, const char *log_lvl)
{
	struct unwind_state state;
	struct stack_info stack_info = {0};
	unsigned long visit_mask = 0;
	int graph_idx = 0;
	bool partial = false;

	printk("%sCall Trace:\n", log_lvl);

	unwind_start(&state, task, regs, stack);
	stack = stack ? : get_stack_pointer(task, regs);
	regs = unwind_get_entry_regs(&state, &partial);

	/*
	 * Iterate through the stacks, starting with the current stack pointer.
	 * Each stack has a pointer to the next one.
	 *
	 * x86-64 can have several stacks:
	 * - task stack
	 * - interrupt stack
	 * - HW exception stacks (double fault, nmi, debug, mce)
	 * - entry stack
	 *
	 * x86-32 can have up to four stacks:
	 * - task stack
	 * - softirq stack
	 * - hardirq stack
	 * - entry stack
	 */
	for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {
		const char *stack_name;

		if (get_stack_info(stack, task, &stack_info, &visit_mask)) {
			/*
			 * We weren't on a valid stack.  It's possible that
			 * we overflowed a valid stack into a guard page.
			 * See if the next page up is valid so that we can
			 * generate some kind of backtrace if this happens.
			 */
			stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack);
			if (get_stack_info(stack, task, &stack_info, &visit_mask))
				break;
		}

		stack_name = stack_type_name(stack_info.type);
		if (stack_name)
			printk("%s <%s>\n", log_lvl, stack_name);

		if (regs)
			show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);

		/*
		 * Scan the stack, printing any text addresses we find.  At the
		 * same time, follow proper stack frames with the unwinder.
		 *
		 * Addresses found during the scan which are not reported by
		 * the unwinder are considered to be additional clues which are
		 * sometimes useful for debugging and are prefixed with '?'.
		 * This also serves as a failsafe option in case the unwinder
		 * goes off in the weeds.
		 */
		for (; stack < stack_info.end; stack++) {
			unsigned long real_addr;
			int reliable = 0;
			unsigned long addr = READ_ONCE_NOCHECK(*stack);
			unsigned long *ret_addr_p =
				unwind_get_return_address_ptr(&state);

			if (!__kernel_text_address(addr))
				continue;

			/*
			 * Don't print regs->ip again if it was already printed
			 * by show_regs_if_on_stack().
			 */
			if (regs && stack == &regs->ip)
				goto next;

			if (stack == ret_addr_p)
				reliable = 1;

			/*
			 * When function graph tracing is enabled for a
			 * function, its return address on the stack is
			 * replaced with the address of an ftrace handler
			 * (return_to_handler).  In that case, before printing
			 * the "real" address, we want to print the handler
			 * address as an "unreliable" hint that function graph
			 * tracing was involved.
			 */
			real_addr = ftrace_graph_ret_addr(task, &graph_idx,
							  addr, stack);
			if (real_addr != addr)
				printk_stack_address(addr, 0, log_lvl);
			printk_stack_address(real_addr, reliable, log_lvl);

			if (!reliable)
				continue;

next:
			/*
			 * Get the next frame from the unwinder.  No need to
			 * check for an error: if anything goes wrong, the rest
			 * of the addresses will just be printed as unreliable.
			 */
			unwind_next_frame(&state);

			/* if the frame has entry regs, print them */
			regs = unwind_get_entry_regs(&state, &partial);
			if (regs)
				show_regs_if_on_stack(&stack_info, regs, partial, log_lvl);
		}

		if (stack_name)
			printk("%s </%s>\n", log_lvl, stack_name);
	}
}

void show_stack(struct task_struct *task, unsigned long *sp,
		const char *loglvl)
{
	task = task ? : current;

	/*
	 * Stack frames below this one aren't interesting.  Don't show them
	 * if we're printing for %current.
	 */
	if (!sp && task == current)
		sp = get_stack_pointer(current, NULL);

	show_trace_log_lvl(task, NULL, sp, loglvl);
}

void show_stack_regs(struct pt_regs *regs)
{
	show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}

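/*
 * Serialize oopses: only one CPU may oops at a time.  A CPU that faults
 * again while already holding the lock (a nested oops) is let through,
 * and die_nest_count ensures only the outermost oops_end() releases the
 * lock.
 */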
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;

unsigned long oops_begin(void)
{
	int cpu;
	unsigned long flags;

	oops_enter();

	/* racy, but better than risking deadlock. */
	flags = hard_local_irq_save();
	cpu = smp_processor_id();
	if (!arch_spin_trylock(&die_lock)) {
		if (cpu == die_owner)
			/* nested oops. should stop eventually */;
		else
			arch_spin_lock(&die_lock);
	}
	die_nest_count++;
	die_owner = cpu;
	console_verbose();
	bust_spinlocks(1);
	return flags;
}
NOKPROBE_SYMBOL(oops_begin);

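/*
 * Defined in the entry assembly: resets the stack pointer to the base of
 * the task stack before calling do_exit(@signr), so the dying task does
 * not exit on an IST or entry stack.
 */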
void __noreturn rewind_stack_do_exit(int signr);

void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
	if (regs && kexec_should_crash(current))
		crash_kexec(regs);

	bust_spinlocks(0);
	die_owner = -1;
	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
	die_nest_count--;
	if (!die_nest_count)
		/* Nest count reached zero, release the lock. */
		arch_spin_unlock(&die_lock);
	hard_local_irq_restore(flags);
	oops_exit();

	/* Executive summary in case the oops scrolled away */
	__show_regs(&exec_summary_regs, SHOW_REGS_ALL, KERN_DEFAULT);

	if (!signr)
		return;
	if (in_interrupt())
		panic("Fatal exception in interrupt");
	if (panic_on_oops)
		panic("Fatal exception");

	/*
	 * We're not going to return, but we might be on an IST stack or
	 * have very little stack space left.  Rewind the stack and kill
	 * the task.
	 * Before we rewind the stack, we have to tell KASAN that we're going to
	 * reuse the task stack and that existing poisons are invalid.
	 */
	kasan_unpoison_task_stack(current);
	rewind_stack_do_exit(signr);
}
NOKPROBE_SYMBOL(oops_end);

static void __die_header(const char *str, struct pt_regs *regs, long err)
{
	const char *pr = "";

	irq_pipeline_oops();

	/* Save the regs of the first oops for the executive summary later. */
	if (!die_counter)
		exec_summary_regs = *regs;

	if (IS_ENABLED(CONFIG_PREEMPTION))
		pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";

	printk(KERN_DEFAULT
	       "%s: %04lx [#%d]%s%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
	       pr,
	       IS_ENABLED(CONFIG_SMP) ? " SMP" : "",
	       debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "",
	       IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
	       IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
	       (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "",
	       irqs_pipelined() ? " IRQ_PIPELINE" : "");
}
NOKPROBE_SYMBOL(__die_header);

static int __die_body(const char *str, struct pt_regs *regs, long err)
{
	show_regs(regs);
	print_modules();

	if (notify_die(DIE_OOPS, str, regs, err,
		       current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP)
		return 1;

	return 0;
}
NOKPROBE_SYMBOL(__die_body);

int __die(const char *str, struct pt_regs *regs, long err)
{
	__die_header(str, regs, err);
	return __die_body(str, regs, err);
}
NOKPROBE_SYMBOL(__die);

/*
 * This path is taken when something in the kernel has done something bad
 * and is about to be terminated:
 */
void die(const char *str, struct pt_regs *regs, long err)
{
	unsigned long flags = oops_begin();
	int sig = SIGSEGV;

	if (__die(str, regs, err))
		sig = 0;
	oops_end(flags, regs, sig);
}

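/*
 * Like die(), but also gives KASAN a chance to report on @gp_addr first
 * when the faulting address is non-canonical, which usually points at a
 * wild memory access.
 */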
void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
{
	unsigned long flags = oops_begin();
	int sig = SIGSEGV;

	__die_header(str, regs, err);
	if (gp_addr)
		kasan_non_canonical_hook(gp_addr);
	if (__die_body(str, regs, err))
		sig = 0;
	oops_end(flags, regs, sig);
}

void show_regs(struct pt_regs *regs)
{
	enum show_regs_mode print_kernel_regs;

	show_regs_print_info(KERN_DEFAULT);

	print_kernel_regs = user_mode(regs) ? SHOW_REGS_USER : SHOW_REGS_ALL;
	__show_regs(regs, print_kernel_regs, KERN_DEFAULT);

	/*
	 * When in kernel mode, we also print out the stack at the time of
	 * the fault.
	 */
	if (!user_mode(regs))
		show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}