hc
2023-12-11 d2ccde1c8e90d38cee87a1b0309ad2827f3fd30d
kernel/kernel/trace/trace_stack.c
....@@ -5,6 +5,7 @@
55 */
66 #include <linux/sched/task_stack.h>
77 #include <linux/stacktrace.h>
8
+#include <linux/security.h>
89 #include <linux/kallsyms.h>
910 #include <linux/seq_file.h>
1011 #include <linux/spinlock.h>
....@@ -18,44 +19,32 @@
1819
1920 #include "trace.h"
2021
21
-static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
22
- { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
23
-unsigned stack_trace_index[STACK_TRACE_ENTRIES];
22
+#define STACK_TRACE_ENTRIES 500
2423
25
-/*
26
- * Reserve one entry for the passed in ip. This will allow
27
- * us to remove most or all of the stack size overhead
28
- * added by the stack tracer itself.
29
- */
30
-struct stack_trace stack_trace_max = {
31
- .max_entries = STACK_TRACE_ENTRIES - 1,
32
- .entries = &stack_dump_trace[0],
33
-};
24
+static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES];
25
+static unsigned stack_trace_index[STACK_TRACE_ENTRIES];
3426
35
-unsigned long stack_trace_max_size;
36
-arch_spinlock_t stack_trace_max_lock =
27
+static unsigned int stack_trace_nr_entries;
28
+static unsigned long stack_trace_max_size;
29
+static arch_spinlock_t stack_trace_max_lock =
3730 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
3831
3932 DEFINE_PER_CPU(int, disable_stack_tracer);
4033 static DEFINE_MUTEX(stack_sysctl_mutex);
4134
4235 int stack_tracer_enabled;
43
-static int last_stack_tracer_enabled;
4436
45
-void stack_trace_print(void)
37
+static void print_max_stack(void)
4638 {
4739 long i;
4840 int size;
4941
5042 pr_emerg(" Depth Size Location (%d entries)\n"
5143 " ----- ---- --------\n",
52
- stack_trace_max.nr_entries);
44
+ stack_trace_nr_entries);
5345
54
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
55
- if (stack_dump_trace[i] == ULONG_MAX)
56
- break;
57
- if (i+1 == stack_trace_max.nr_entries ||
58
- stack_dump_trace[i+1] == ULONG_MAX)
46
+ for (i = 0; i < stack_trace_nr_entries; i++) {
47
+ if (i + 1 == stack_trace_nr_entries)
5948 size = stack_trace_index[i];
6049 else
6150 size = stack_trace_index[i] - stack_trace_index[i+1];
....@@ -66,15 +55,104 @@
6655 }
6756
6857 /*
69
- * When arch-specific code overrides this function, the following
70
- * data should be filled up, assuming stack_trace_max_lock is held to
71
- * prevent concurrent updates.
72
- * stack_trace_index[]
73
- * stack_trace_max
74
- * stack_trace_max_size
58
+ * The stack tracer looks for a maximum stack at each call from a function. It
59
+ * registers a callback from ftrace, and in that callback it examines the stack
60
+ * size. It determines the stack size from the variable passed in, which is the
61
+ * address of a local variable in the stack_trace_call() callback function.
62
+ * The stack size is calculated as the distance from that local variable to the top
63
+ * of the current stack. If that size is smaller than the currently saved max
64
+ * stack size, nothing more is done.
65
+ *
66
+ * If the size of the stack is greater than the maximum recorded size, then the
67
+ * following algorithm takes place.
68
+ *
69
+ * For architectures (like x86) that store the function's return address before
70
+ * saving the function's local variables, the stack will look something like
71
+ * this:
72
+ *
73
+ * [ top of stack ]
74
+ * 0: sys call entry frame
75
+ * 10: return addr to entry code
76
+ * 11: start of sys_foo frame
77
+ * 20: return addr to sys_foo
78
+ * 21: start of kernel_func_bar frame
79
+ * 30: return addr to kernel_func_bar
80
+ * 31: [ do trace stack here ]
81
+ *
82
+ * stack_trace_save() is then called, returning all the functions it finds in the
83
+ * current stack. Which would be (from the bottom of the stack to the top):
84
+ *
85
+ * return addr to kernel_func_bar
86
+ * return addr to sys_foo
87
+ * return addr to entry code
88
+ *
89
+ * Now to figure out the size of each of these functions' local variables,
90
+ * a search of the stack is made to find these values. When a match is made, it
91
+ * is added to the stack_dump_trace[] array. The offset into the stack is saved
92
+ * in the stack_trace_index[] array. The above example would show:
93
+ *
94
+ * stack_dump_trace[] | stack_trace_index[]
95
+ * ------------------ + -------------------
96
+ * return addr to kernel_func_bar | 30
97
+ * return addr to sys_foo | 20
98
+ * return addr to entry | 10
99
+ *
100
+ * The print_max_stack() function above uses these values to print the size of
101
+ * each function's portion of the stack.
102
+ *
103
+ * for (i = 0; i < nr_entries; i++) {
104
+ * size = i == nr_entries - 1 ? stack_trace_index[i] :
105
+ * stack_trace_index[i] - stack_trace_index[i+1];
106
+ * print("%d %d %d %s\n", i, stack_trace_index[i], size, stack_dump_trace[i]);
107
+ * }
108
+ *
109
+ * The above shows
110
+ *
111
+ * depth size location
112
+ * ----- ---- --------
113
+ * 0 30 10 kernel_func_bar
114
+ * 1 20 10 sys_foo
115
+ * 2 10 10 entry code
116
+ *
117
+ * Now for architectures that might save the return address after the function's
118
+ * local variables (saving the link register before calling nested functions),
119
+ * this will cause the stack to look a little different:
120
+ *
121
+ * [ top of stack ]
122
+ * 0: sys call entry frame
123
+ * 10: start of sys_foo_frame
124
+ * 19: return addr to entry code << lr saved before calling kernel_func_bar
125
+ * 20: start of kernel_func_bar frame
126
+ * 29: return addr to sys_foo_frame << lr saved before calling next function
127
+ * 30: [ do trace stack here ]
128
+ *
129
+ * Although the functions returned by stack_trace_save() may be the same, the
130
+ * placement in the stack will be different. Using the same algorithm as above
131
+ * would yield:
132
+ *
133
+ * stack_dump_trace[] | stack_trace_index[]
134
+ * ------------------ + -------------------
135
+ * return addr to kernel_func_bar | 30
136
+ * return addr to sys_foo | 29
137
+ * return addr to entry | 19
138
+ *
139
+ * Where the mapping is off by one:
140
+ *
141
+ * kernel_func_bar stack frame size is 29 - 19 not 30 - 29!
142
+ *
143
+ * To fix this, if the architecture sets ARCH_FTRACE_SHIFT_STACK_TRACER the
144
+ * values in stack_trace_index[] are shifted down by one and the number of
145
+ * stack trace entries is decremented by one.
146
+ *
147
+ * stack_dump_trace[] | stack_trace_index[]
148
+ * ------------------ + -------------------
149
+ * return addr to kernel_func_bar | 29
150
+ * return addr to sys_foo | 19
151
+ *
152
+ * Although the entry function is no longer displayed, its stack usage is
153
+ * still included in the size reported for the last entry shown (sys_foo).
75154 */
76
-void __weak
77
-check_stack(unsigned long ip, unsigned long *stack)
155
+static void check_stack(unsigned long ip, unsigned long *stack)
78156 {
79157 unsigned long this_size, flags; unsigned long *p, *top, *start;
80158 static int tracer_frame;
....@@ -110,13 +188,12 @@
110188
111189 stack_trace_max_size = this_size;
112190
113
- stack_trace_max.nr_entries = 0;
114
- stack_trace_max.skip = 3;
115
-
116
- save_stack_trace(&stack_trace_max);
191
+ stack_trace_nr_entries = stack_trace_save(stack_dump_trace,
192
+ ARRAY_SIZE(stack_dump_trace) - 1,
193
+ 0);
117194
118195 /* Skip over the overhead of the stack tracer itself */
119
- for (i = 0; i < stack_trace_max.nr_entries; i++) {
196
+ for (i = 0; i < stack_trace_nr_entries; i++) {
120197 if (stack_dump_trace[i] == ip)
121198 break;
122199 }
....@@ -125,7 +202,7 @@
125202 * Some archs may not have the passed in ip in the dump.
126203 * If that happens, we need to show everything.
127204 */
128
- if (i == stack_trace_max.nr_entries)
205
+ if (i == stack_trace_nr_entries)
129206 i = 0;
130207
131208 /*
....@@ -143,15 +220,13 @@
143220 * loop will only happen once. This code only takes place
144221 * on a new max, so it is far from a fast path.
145222 */
146
- while (i < stack_trace_max.nr_entries) {
223
+ while (i < stack_trace_nr_entries) {
147224 int found = 0;
148225
149226 stack_trace_index[x] = this_size;
150227 p = start;
151228
152
- for (; p < top && i < stack_trace_max.nr_entries; p++) {
153
- if (stack_dump_trace[i] == ULONG_MAX)
154
- break;
229
+ for (; p < top && i < stack_trace_nr_entries; p++) {
155230 /*
156231 * The READ_ONCE_NOCHECK is used to let KASAN know that
157232 * this is not a stack-out-of-bounds error.
....@@ -182,12 +257,24 @@
182257 i++;
183258 }
184259
185
- stack_trace_max.nr_entries = x;
186
- for (; x < i; x++)
187
- stack_dump_trace[x] = ULONG_MAX;
260
+#ifdef ARCH_FTRACE_SHIFT_STACK_TRACER
261
+ /*
262
+ * Some archs will store the link register before calling
263
+ * nested functions. This means the saved return address
264
+ * comes after the local storage, and we need to shift
265
+ * for that.
266
+ */
267
+ if (x > 1) {
268
+ memmove(&stack_trace_index[0], &stack_trace_index[1],
269
+ sizeof(stack_trace_index[0]) * (x - 1));
270
+ x--;
271
+ }
272
+#endif
273
+
274
+ stack_trace_nr_entries = x;
188275
189276 if (task_stack_end_corrupted(current)) {
190
- stack_trace_print();
277
+ print_max_stack();
191278 BUG();
192279 }
193280
....@@ -291,7 +378,7 @@
291378 {
292379 long n = *pos - 1;
293380
294
- if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
381
+ if (n >= stack_trace_nr_entries)
295382 return NULL;
296383
297384 m->private = (void *)n;
....@@ -355,7 +442,7 @@
355442 seq_printf(m, " Depth Size Location"
356443 " (%d entries)\n"
357444 " ----- ---- --------\n",
358
- stack_trace_max.nr_entries);
445
+ stack_trace_nr_entries);
359446
360447 if (!stack_tracer_enabled && !stack_trace_max_size)
361448 print_disabled(m);
....@@ -365,12 +452,10 @@
365452
366453 i = *(long *)v;
367454
368
- if (i >= stack_trace_max.nr_entries ||
369
- stack_dump_trace[i] == ULONG_MAX)
455
+ if (i >= stack_trace_nr_entries)
370456 return 0;
371457
372
- if (i+1 == stack_trace_max.nr_entries ||
373
- stack_dump_trace[i+1] == ULONG_MAX)
458
+ if (i + 1 == stack_trace_nr_entries)
374459 size = stack_trace_index[i];
375460 else
376461 size = stack_trace_index[i] - stack_trace_index[i+1];
....@@ -391,6 +476,12 @@
391476
392477 static int stack_trace_open(struct inode *inode, struct file *file)
393478 {
479
+ int ret;
480
+
481
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
482
+ if (ret)
483
+ return ret;
484
+
394485 return seq_open(file, &stack_trace_seq_ops);
395486 }
396487
....@@ -408,6 +499,7 @@
408499 {
409500 struct ftrace_ops *ops = inode->i_private;
410501
502
+ /* Checks for tracefs lockdown */
411503 return ftrace_regex_open(ops, FTRACE_ITER_FILTER,
412504 inode, file);
413505 }
....@@ -423,27 +515,24 @@
423515 #endif /* CONFIG_DYNAMIC_FTRACE */
424516
425517 int
426
-stack_trace_sysctl(struct ctl_table *table, int write,
427
- void __user *buffer, size_t *lenp,
428
- loff_t *ppos)
518
+stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
519
+ size_t *lenp, loff_t *ppos)
429520 {
521
+ int was_enabled;
430522 int ret;
431523
432524 mutex_lock(&stack_sysctl_mutex);
525
+ was_enabled = !!stack_tracer_enabled;
433526
434527 ret = proc_dointvec(table, write, buffer, lenp, ppos);
435528
436
- if (ret || !write ||
437
- (last_stack_tracer_enabled == !!stack_tracer_enabled))
529
+ if (ret || !write || (was_enabled == !!stack_tracer_enabled))
438530 goto out;
439
-
440
- last_stack_tracer_enabled = !!stack_tracer_enabled;
441531
442532 if (stack_tracer_enabled)
443533 register_ftrace_function(&trace_ops);
444534 else
445535 unregister_ftrace_function(&trace_ops);
446
-
447536 out:
448537 mutex_unlock(&stack_sysctl_mutex);
449538 return ret;
....@@ -453,31 +542,32 @@
453542
454543 static __init int enable_stacktrace(char *str)
455544 {
456
- if (strncmp(str, "_filter=", 8) == 0)
457
- strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
545
+ int len;
546
+
547
+ if ((len = str_has_prefix(str, "_filter=")))
548
+ strncpy(stack_trace_filter_buf, str + len, COMMAND_LINE_SIZE);
458549
459550 stack_tracer_enabled = 1;
460
- last_stack_tracer_enabled = 1;
461551 return 1;
462552 }
463553 __setup("stacktrace", enable_stacktrace);
464554
465555 static __init int stack_trace_init(void)
466556 {
467
- struct dentry *d_tracer;
557
+ int ret;
468558
469
- d_tracer = tracing_init_dentry();
470
- if (IS_ERR(d_tracer))
559
+ ret = tracing_init_dentry();
560
+ if (ret)
471561 return 0;
472562
473
- trace_create_file("stack_max_size", 0644, d_tracer,
563
+ trace_create_file("stack_max_size", 0644, NULL,
474564 &stack_trace_max_size, &stack_max_size_fops);
475565
476
- trace_create_file("stack_trace", 0444, d_tracer,
566
+ trace_create_file("stack_trace", 0444, NULL,
477567 NULL, &stack_trace_fops);
478568
479569 #ifdef CONFIG_DYNAMIC_FTRACE
480
- trace_create_file("stack_trace_filter", 0644, d_tracer,
570
+ trace_create_file("stack_trace_filter", 0644, NULL,
481571 &trace_ops, &stack_trace_filter_fops);
482572 #endif
483573