From 102a0743326a03cd1a1202ceda21e175b7d3575c Mon Sep 17 00:00:00 2001
From: hc <hc@nodka.com>
Date: Tue, 20 Feb 2024 01:20:52 +0000
Subject: [PATCH] add new system file

---
 kernel/kernel/trace/trace_stack.c | 224 +++++++++++++++++++++++++++++++++++++++----------------
 1 files changed, 157 insertions(+), 67 deletions(-)

diff --git a/kernel/kernel/trace/trace_stack.c b/kernel/kernel/trace/trace_stack.c
index 4033709..c408423 100644
--- a/kernel/kernel/trace/trace_stack.c
+++ b/kernel/kernel/trace/trace_stack.c
@@ -5,6 +5,7 @@
  */
 #include <linux/sched/task_stack.h>
 #include <linux/stacktrace.h>
+#include <linux/security.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/spinlock.h>
@@ -18,44 +19,32 @@
 
 #include "trace.h"
 
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
-	 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
+#define STACK_TRACE_ENTRIES 500
 
-/*
- * Reserve one entry for the passed in ip. This will allow
- * us to remove most or all of the stack size overhead
- * added by the stack tracer itself.
- */
-struct stack_trace stack_trace_max = {
-	.max_entries		= STACK_TRACE_ENTRIES - 1,
-	.entries		= &stack_dump_trace[0],
-};
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
 
-unsigned long stack_trace_max_size;
-arch_spinlock_t stack_trace_max_lock =
+static unsigned int stack_trace_nr_entries;
+static unsigned long stack_trace_max_size;
+static arch_spinlock_t stack_trace_max_lock =
 	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
 
 DEFINE_PER_CPU(int, disable_stack_tracer);
 static DEFINE_MUTEX(stack_sysctl_mutex);
 
 int stack_tracer_enabled;
-static int last_stack_tracer_enabled;
 
-void stack_trace_print(void)
+static void print_max_stack(void)
 {
 	long i;
 	int size;
 
 	pr_emerg("        Depth    Size   Location    (%d entries)\n"
 		 "        -----    ----   --------\n",
-		 stack_trace_max.nr_entries);
+		 stack_trace_nr_entries);
 
-	for (i = 0; i < stack_trace_max.nr_entries; i++) {
-		if (stack_dump_trace[i] == ULONG_MAX)
-			break;
-		if (i+1 == stack_trace_max.nr_entries ||
-		    stack_dump_trace[i+1] == ULONG_MAX)
+	for (i = 0; i < stack_trace_nr_entries; i++) {
+		if (i + 1 == stack_trace_nr_entries)
 			size = stack_trace_index[i];
 		else
 			size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -66,15 +55,104 @@
 }
 
 /*
- * When arch-specific code overrides this function, the following
- * data should be filled up, assuming stack_trace_max_lock is held to
- * prevent concurrent updates.
- *     stack_trace_index[]
- *     stack_trace_max
- *     stack_trace_max_size
+ * The stack tracer looks for a maximum stack at each call from a function. It
+ * registers a callback from ftrace, and in that callback it examines the stack
+ * size. It determines the stack size from the variable passed in, which is the
+ * address of a local variable in the stack_trace_call() callback function.
+ * The stack size is calculated by the address of the local variable to the top
+ * of the current stack. If that size is smaller than the currently saved max
+ * stack size, nothing more is done.
+ *
+ * If the size of the stack is greater than the maximum recorded size, then the
+ * following algorithm takes place.
+ *
+ * For architectures (like x86) that store the function's return address before
+ * saving the function's local variables, the stack will look something like
+ * this:
+ *
+ *   [ top of stack ]
+ *    0: sys call entry frame
+ *   10: return addr to entry code
+ *   11: start of sys_foo frame
+ *   20: return addr to sys_foo
+ *   21: start of kernel_func_bar frame
+ *   30: return addr to kernel_func_bar
+ *   31: [ do trace stack here ]
+ *
+ * The save_stack_trace() is called, returning all the functions it finds in
+ * the current stack, which would be (from the bottom of the stack to the top):
+ *
+ *   return addr to kernel_func_bar
+ *   return addr to sys_foo
+ *   return addr to entry code
+ *
+ * Now, to figure out how large each of these functions' local variables are, a
+ * search of the stack is made to find these values. When a match is made, it
+ * is added to the stack_dump_trace[] array. The offset into the stack is saved
+ * in the stack_trace_index[] array. The above example would show:
+ *
+ *        stack_dump_trace[]        |   stack_trace_index[]
+ *        ------------------        +   -------------------
+ *  return addr to kernel_func_bar  |          30
+ *  return addr to sys_foo          |          20
+ *  return addr to entry            |          10
+ *
+ * The print_max_stack() function above uses these values to print the size of
+ * each function's portion of the stack.
+ *
+ *  for (i = 0; i < nr_entries; i++) {
+ *     size = i == nr_entries - 1 ? stack_trace_index[i] :
+ *                    stack_trace_index[i] - stack_trace_index[i+1]
+ *     print "%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i];
+ *  }
+ *
+ * The above shows
+ *
+ *     depth size location
+ *     ----- ---- --------
+ *  0    30   10  kernel_func_bar
+ *  1    20   10  sys_foo
+ *  2    10   10  entry code
+ *
+ * Now for architectures that might save the return address after the
+ * function's local variables (saving the link register before calling nested
+ * functions), this will cause the stack to look a little different:
+ *
+ * [ top of stack ]
+ *  0: sys call entry frame
+ * 10: start of sys_foo_frame
+ * 19: return addr to entry code << lr saved before calling kernel_func_bar
+ * 20: start of kernel_func_bar frame
+ * 29: return addr to sys_foo_frame << lr saved before calling next function
+ * 30: [ do trace stack here ]
+ *
+ * Although the functions returned by save_stack_trace() may be the same, the
+ * placement in the stack will be different. Using the same algorithm as above
+ * would yield:
+ *
+ *        stack_dump_trace[]        |   stack_trace_index[]
+ *        ------------------        +   -------------------
+ *  return addr to kernel_func_bar  |          30
+ *  return addr to sys_foo          |          29
+ *  return addr to entry            |          19
+ *
+ * Where the mapping is off by one:
+ *
+ *   kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
+ *
+ * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS, the
+ * values in stack_trace_index[] are shifted by one, and the number of stack
+ * trace entries is decremented by one.
+ *
+ *        stack_dump_trace[]        |   stack_trace_index[]
+ *        ------------------        +   -------------------
+ *  return addr to kernel_func_bar  |          29
+ *  return addr to sys_foo          |          19
+ *
+ * Although the entry function is not displayed, the first function (sys_foo)
+ * will still include the entry function's stack size in its total.
  */
-void __weak
-check_stack(unsigned long ip, unsigned long *stack)
+static void check_stack(unsigned long ip, unsigned long *stack)
 {
 	unsigned long this_size, flags; unsigned long *p, *top, *start;
 	static int tracer_frame;
@@ -110,13 +188,12 @@
 
 	stack_trace_max_size = this_size;
 
-	stack_trace_max.nr_entries = 0;
-	stack_trace_max.skip = 3;
-
-	save_stack_trace(&stack_trace_max);
+	stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
+					       ARRAY_SIZE(stack_dump_trace) - 1,
+					       0);
 
 	/* Skip over the overhead of the stack tracer itself */
-	for (i = 0; i < stack_trace_max.nr_entries; i++) {
+	for (i = 0; i < stack_trace_nr_entries; i++) {
 		if (stack_dump_trace[i] == ip)
 			break;
 	}
@@ -125,7 +202,7 @@
 	 * Some archs may not have the passed in ip in the dump.
 	 * If that happens, we need to show everything.
 	 */
-	if (i == stack_trace_max.nr_entries)
+	if (i == stack_trace_nr_entries)
 		i = 0;
 
 	/*
@@ -143,15 +220,13 @@
 	 * loop will only happen once. This code only takes place
 	 * on a new max, so it is far from a fast path.
 	 */
-	while (i < stack_trace_max.nr_entries) {
+	while (i < stack_trace_nr_entries) {
 		int found = 0;
 
 		stack_trace_index[x] = this_size;
 		p = start;
 
-		for (; p < top && i < stack_trace_max.nr_entries; p++) {
-			if (stack_dump_trace[i] == ULONG_MAX)
-				break;
+		for (; p < top && i < stack_trace_nr_entries; p++) {
 			/*
 			 * The READ_ONCE_NOCHECK is used to let KASAN know that
 			 * this is not a stack-out-of-bounds error.
@@ -182,12 +257,24 @@
 		i++;
 	}
 
-	stack_trace_max.nr_entries = x;
-	for (; x < i; x++)
-		stack_dump_trace[x] = ULONG_MAX;
+#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
+	/*
+	 * Some archs will store the link register before calling
+	 * nested functions. This means the saved return address
+	 * comes after the local storage, and we need to shift
+	 * for that.
+	 */
+	if (x > 1) {
+		memmove(&stack_trace_index[0], &stack_trace_index[1],
+			sizeof(stack_trace_index[0]) * (x - 1));
+		x--;
+	}
+#endif
+
+	stack_trace_nr_entries = x;
 
 	if (task_stack_end_corrupted(current)) {
-		stack_trace_print();
+		print_max_stack();
 		BUG();
 	}
@@ -291,7 +378,7 @@
 {
 	long n = *pos - 1;
 
-	if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+	if (n >= stack_trace_nr_entries)
 		return NULL;
 
 	m->private = (void *)n;
@@ -355,7 +442,7 @@
 	seq_printf(m, "        Depth    Size   Location"
 		   "    (%d entries)\n"
 		   "        -----    ----   --------\n",
-		   stack_trace_max.nr_entries);
+		   stack_trace_nr_entries);
 
 	if (!stack_tracer_enabled && !stack_trace_max_size)
 		print_disabled(m);
@@ -365,12 +452,10 @@
 
 	i = *(long *)v;
 
-	if (i >= stack_trace_max.nr_entries ||
-	    stack_dump_trace[i] == ULONG_MAX)
+	if (i >= stack_trace_nr_entries)
 		return 0;
 
-	if (i+1 == stack_trace_max.nr_entries ||
-	    stack_dump_trace[i+1] == ULONG_MAX)
+	if (i + 1 == stack_trace_nr_entries)
 		size = stack_trace_index[i];
 	else
 		size = stack_trace_index[i] - stack_trace_index[i+1];
@@ -391,6 +476,12 @@
 
 static int stack_trace_open(struct inode *inode, struct file *file)
 {
+	int ret;
+
+	ret = security_locked_down(LOCKDOWN_TRACEFS);
+	if (ret)
+		return ret;
+
 	return seq_open(file, &stack_trace_seq_ops);
 }
 
@@ -408,6 +499,7 @@
 {
 	struct ftrace_ops *ops = inode->i_private;
 
+	/* Checks for tracefs lockdown */
 	return ftrace_regex_open(ops, FTRACE_ITER_FILTER, inode, file);
 }
 
@@ -423,27 +515,24 @@
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 int
-stack_trace_sysctl(struct ctl_table *table, int write,
-		   void __user *buffer, size_t *lenp,
-		   loff_t *ppos)
+stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+		   size_t *lenp, loff_t *ppos)
 {
+	int was_enabled;
 	int ret;
 
 	mutex_lock(&stack_sysctl_mutex);
+	was_enabled = !!stack_tracer_enabled;
 
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 
-	if (ret || !write ||
-	    (last_stack_tracer_enabled == !!stack_tracer_enabled))
+	if (ret || !write || (was_enabled == !!stack_tracer_enabled))
 		goto out;
-
-	last_stack_tracer_enabled = !!stack_tracer_enabled;
 
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
 	else
 		unregister_ftrace_function(&trace_ops);
-
  out:
 	mutex_unlock(&stack_sysctl_mutex);
 	return ret;
@@ -453,31 +542,32 @@
 
 static __init int enable_stacktrace(char *str)
 {
-	if (strncmp(str, "_filter=", 8) == 0)
-		strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+	int len;
+
+	if ((len = str_has_prefix(str, "_filter=")))
+		strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
 
 	stack_tracer_enabled = 1;
-	last_stack_tracer_enabled = 1;
 	return 1;
 }
 __setup("stacktrace", enable_stacktrace);
 
 static __init int stack_trace_init(void)
 {
-	struct dentry *d_tracer;
+	int ret;
 
-	d_tracer = tracing_init_dentry();
-	if (IS_ERR(d_tracer))
+	ret = tracing_init_dentry();
+	if (ret)
 		return 0;
 
-	trace_create_file("stack_max_size", 0644, d_tracer,
+	trace_create_file("stack_max_size", 0644, NULL,
 			&stack_trace_max_size, &stack_max_size_fops);
 
-	trace_create_file("stack_trace", 0444, d_tracer,
+	trace_create_file("stack_trace", 0444, NULL,
 			NULL, &stack_trace_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	trace_create_file("stack_trace_filter", 0644, d_tracer,
+	trace_create_file("stack_trace_filter", 0644, NULL,
 			  &trace_ops, &stack_trace_filter_fops);
 #endif
-- 
Gitblit v1.6.2
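
Note: the per-function sizing rule described in the big comment (each entry's size
is the gap between its stack offset and the next shallower entry's offset, with the
last entry keeping its full offset) is the same logic print_max_stack() and t_show()
use. The snippet below is a minimal, self-contained sketch that replays the comment's
worked example; the function names and the 30/20/10 offsets are sample data taken from
that comment, not output of the patch itself.

	#include <stdio.h>

	/* Hypothetical sample data mirroring the comment's x86 example:
	 * deepest function first, offsets as saved in stack_trace_index[]. */
	static const char *stack_dump_trace[] = {
		"kernel_func_bar", "sys_foo", "entry code"
	};
	static unsigned int stack_trace_index[] = { 30, 20, 10 };
	static unsigned int stack_trace_nr_entries = 3;

	int main(void)
	{
		unsigned int i, size;

		printf(" Depth    Size   Location\n -----    ----   --------\n");
		for (i = 0; i < stack_trace_nr_entries; i++) {
			/* The last entry owns everything below it; every other
			 * frame's size is the gap to the next shallower entry. */
			if (i + 1 == stack_trace_nr_entries)
				size = stack_trace_index[i];
			else
				size = stack_trace_index[i] - stack_trace_index[i + 1];
			printf("%3u) %8u %8u   %s\n", i, stack_trace_index[i],
			       size, stack_dump_trace[i]);
		}
		return 0;
	}

Running it reproduces the "0 30 10 kernel_func_bar / 1 20 10 sys_foo / 2 10 10 entry
code" table from the comment.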
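
Note: the ARCH_FTRACE_SHIFT_STACK_TRACER hunk compensates for link-register saving by
sliding stack_trace_index[] down one slot and dropping the last entry, as the comment
explains. Below is a small stand-alone sketch of just that correction, using the
30/29/19 offsets from the comment's example; the values are illustrative assumptions.

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Offsets as found before the correction: off by one frame. */
		unsigned int stack_trace_index[] = { 30, 29, 19 };
		unsigned int x = 3;

		/* Same shift check_stack() performs under
		 * ARCH_FTRACE_SHIFT_STACK_TRACER: drop the deepest index and
		 * shorten the trace by one entry. */
		memmove(&stack_trace_index[0], &stack_trace_index[1],
			sizeof(stack_trace_index[0]) * (x - 1));
		x--;

		/* Prints "2 entries: 29 19", matching the corrected table. */
		printf("%u entries: %u %u\n", x, stack_trace_index[0],
		       stack_trace_index[1]);
		return 0;
	}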
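
Note: check_stack() now uses the newer stack_trace_save() interface instead of
save_stack_trace()/struct stack_trace, so there is no ULONG_MAX terminator to strip;
the return value is simply the number of entries written. The sketch below shows how a
kernel-side caller typically uses that interface; my_dump_current_stack() and
MY_MAX_ENTRIES are made-up names for illustration, and locking/error handling are
omitted.

	#include <linux/kernel.h>
	#include <linux/stacktrace.h>

	#define MY_MAX_ENTRIES 64

	static void my_dump_current_stack(void)
	{
		/* stack_trace_save() fills 'entries' with the return addresses
		 * found on the current task's stack and returns how many it
		 * stored; the last argument skips that many innermost frames
		 * (0 keeps them all). */
		unsigned long entries[MY_MAX_ENTRIES];
		unsigned int nr, i;

		nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);

		for (i = 0; i < nr; i++)
			pr_info("  %pS\n", (void *)entries[i]);
	}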