hc
2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/tools/perf/util/thread-stack.c
....@@ -1,25 +1,20 @@
1
+// SPDX-License-Identifier: GPL-2.0-only
12 /*
23 * thread-stack.c: Synthesize a thread's stack using call / return events
34 * Copyright (c) 2014, Intel Corporation.
4
- *
5
- * This program is free software; you can redistribute it and/or modify it
6
- * under the terms and conditions of the GNU General Public License,
7
- * version 2, as published by the Free Software Foundation.
8
- *
9
- * This program is distributed in the hope it will be useful, but WITHOUT
10
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12
- * more details.
13
- *
145 */
156
167 #include <linux/rbtree.h>
178 #include <linux/list.h>
9
+#include <linux/log2.h>
10
+#include <linux/zalloc.h>
1811 #include <errno.h>
12
+#include <stdlib.h>
13
+#include <string.h>
1914 #include "thread.h"
2015 #include "event.h"
2116 #include "machine.h"
22
-#include "util.h"
17
+#include "env.h"
2318 #include "debug.h"
2419 #include "symbol.h"
2520 #include "comm.h"
....@@ -28,22 +23,45 @@
2823
2924 #define STACK_GROWTH 2048
3025
26
+/*
27
+ * State of retpoline detection.
28
+ *
29
+ * RETPOLINE_NONE: no retpoline detection
30
+ * X86_RETPOLINE_POSSIBLE: x86 retpoline possible
31
+ * X86_RETPOLINE_DETECTED: x86 retpoline detected
32
+ */
33
enum retpoline_state_t {
	/* Zero, so a zero-filled thread stack starts with detection off */
	RETPOLINE_NONE		= 0,
	/* x86 target: a retpoline may still show up */
	X86_RETPOLINE_POSSIBLE	= 1,
	/* The previous branch looked like the start of an x86 retpoline */
	X86_RETPOLINE_DETECTED	= 2,
};
38
+
3139 /**
3240 * struct thread_stack_entry - thread stack entry.
3341 * @ret_addr: return address
3442 * @timestamp: timestamp (if known)
3543 * @ref: external reference (e.g. db_id of sample)
3644 * @branch_count: the branch count when the entry was created
45
+ * @insn_count: the instruction count when the entry was created
46
+ * @cyc_count: the cycle count when the entry was created
47
+ * @db_id: id used for db-export
3748 * @cp: call path
3849 * @no_call: a 'call' was not seen
50
+ * @trace_end: a 'call' but trace ended
51
+ * @non_call: a branch but not a 'call' to the start of a different symbol
3952 */
4053 struct thread_stack_entry {
4154 u64 ret_addr;
4255 u64 timestamp;
4356 u64 ref;
4457 u64 branch_count;
58
+ u64 insn_count;
59
+ u64 cyc_count;
60
+ u64 db_id;
4561 struct call_path *cp;
4662 bool no_call;
63
+ bool trace_end;
64
+ bool non_call;
4765 };
4866
4967 /**
....@@ -54,10 +72,18 @@
5472 * @sz: current maximum stack size
5573 * @trace_nr: current trace number
5674 * @branch_count: running branch count
75
+ * @insn_count: running instruction count
76
+ * @cyc_count: running cycle count
5777 * @kernel_start: kernel start address
5878 * @last_time: last timestamp
5979 * @crp: call/return processor
6080 * @comm: current comm
81
+ * @arr_sz: size of array if this is the first element of an array
82
+ * @rstate: used to detect retpolines
83
+ * @br_stack_rb: branch stack (ring buffer)
84
+ * @br_stack_sz: maximum branch stack size
85
+ * @br_stack_pos: current position in @br_stack_rb
86
+ * @mispred_all: mark all branches as mispredicted
6187 */
6288 struct thread_stack {
6389 struct thread_stack_entry *stack;
....@@ -65,11 +91,29 @@
6591 size_t sz;
6692 u64 trace_nr;
6793 u64 branch_count;
94
+ u64 insn_count;
95
+ u64 cyc_count;
6896 u64 kernel_start;
6997 u64 last_time;
7098 struct call_return_processor *crp;
7199 struct comm *comm;
100
+ unsigned int arr_sz;
101
+ enum retpoline_state_t rstate;
102
+ struct branch_stack *br_stack_rb;
103
+ unsigned int br_stack_sz;
104
+ unsigned int br_stack_pos;
105
+ bool mispred_all;
72106 };
107
+
108
+/*
109
+ * Assume pid == tid == 0 identifies the idle task as defined by
110
+ * perf_session__register_idle_thread(). The idle task is really 1 task per cpu,
111
+ * and therefore requires a stack for each cpu.
112
+ */
113
+static inline bool thread_stack__per_cpu(struct thread *thread)
114
+{
115
+ return !(thread->tid || thread->pid_);
116
+}
73117
74118 static int thread_stack__grow(struct thread_stack *ts)
75119 {
....@@ -89,30 +133,110 @@
89133 return 0;
90134 }
91135
92
-static struct thread_stack *thread_stack__new(struct thread *thread,
93
- struct call_return_processor *crp)
136
+static int thread_stack__init(struct thread_stack *ts, struct thread *thread,
137
+ struct call_return_processor *crp,
138
+ bool callstack, unsigned int br_stack_sz)
94139 {
95
- struct thread_stack *ts;
140
+ int err;
96141
97
- ts = zalloc(sizeof(struct thread_stack));
98
- if (!ts)
99
- return NULL;
100
-
101
- if (thread_stack__grow(ts)) {
102
- free(ts);
103
- return NULL;
142
+ if (callstack) {
143
+ err = thread_stack__grow(ts);
144
+ if (err)
145
+ return err;
104146 }
105147
106
- if (thread->mg && thread->mg->machine)
107
- ts->kernel_start = machine__kernel_start(thread->mg->machine);
108
- else
148
+ if (br_stack_sz) {
149
+ size_t sz = sizeof(struct branch_stack);
150
+
151
+ sz += br_stack_sz * sizeof(struct branch_entry);
152
+ ts->br_stack_rb = zalloc(sz);
153
+ if (!ts->br_stack_rb)
154
+ return -ENOMEM;
155
+ ts->br_stack_sz = br_stack_sz;
156
+ }
157
+
158
+ if (thread->maps && thread->maps->machine) {
159
+ struct machine *machine = thread->maps->machine;
160
+ const char *arch = perf_env__arch(machine->env);
161
+
162
+ ts->kernel_start = machine__kernel_start(machine);
163
+ if (!strcmp(arch, "x86"))
164
+ ts->rstate = X86_RETPOLINE_POSSIBLE;
165
+ } else {
109166 ts->kernel_start = 1ULL << 63;
167
+ }
110168 ts->crp = crp;
169
+
170
+ return 0;
171
+}
172
+
173
+static struct thread_stack *thread_stack__new(struct thread *thread, int cpu,
174
+ struct call_return_processor *crp,
175
+ bool callstack,
176
+ unsigned int br_stack_sz)
177
+{
178
+ struct thread_stack *ts = thread->ts, *new_ts;
179
+ unsigned int old_sz = ts ? ts->arr_sz : 0;
180
+ unsigned int new_sz = 1;
181
+
182
+ if (thread_stack__per_cpu(thread) && cpu > 0)
183
+ new_sz = roundup_pow_of_two(cpu + 1);
184
+
185
+ if (!ts || new_sz > old_sz) {
186
+ new_ts = calloc(new_sz, sizeof(*ts));
187
+ if (!new_ts)
188
+ return NULL;
189
+ if (ts)
190
+ memcpy(new_ts, ts, old_sz * sizeof(*ts));
191
+ new_ts->arr_sz = new_sz;
192
+ zfree(&thread->ts);
193
+ thread->ts = new_ts;
194
+ ts = new_ts;
195
+ }
196
+
197
+ if (thread_stack__per_cpu(thread) && cpu > 0 &&
198
+ (unsigned int)cpu < ts->arr_sz)
199
+ ts += cpu;
200
+
201
+ if (!ts->stack &&
202
+ thread_stack__init(ts, thread, crp, callstack, br_stack_sz))
203
+ return NULL;
111204
112205 return ts;
113206 }
114207
115
-static int thread_stack__push(struct thread_stack *ts, u64 ret_addr)
208
+static struct thread_stack *thread__cpu_stack(struct thread *thread, int cpu)
209
+{
210
+ struct thread_stack *ts = thread->ts;
211
+
212
+ if (cpu < 0)
213
+ cpu = 0;
214
+
215
+ if (!ts || (unsigned int)cpu >= ts->arr_sz)
216
+ return NULL;
217
+
218
+ ts += cpu;
219
+
220
+ if (!ts->stack)
221
+ return NULL;
222
+
223
+ return ts;
224
+}
225
+
226
+static inline struct thread_stack *thread__stack(struct thread *thread,
227
+ int cpu)
228
+{
229
+ if (!thread)
230
+ return NULL;
231
+
232
+ if (thread_stack__per_cpu(thread))
233
+ return thread__cpu_stack(thread, cpu);
234
+
235
+ return thread->ts;
236
+}
237
+
238
+static int thread_stack__push(struct thread_stack *ts, u64 ret_addr,
239
+ bool trace_end)
116240 {
117241 int err = 0;
118242
....@@ -124,6 +248,7 @@
124248 }
125249 }
126250
251
+ ts->stack[ts->cnt].trace_end = trace_end;
127252 ts->stack[ts->cnt++].ret_addr = ret_addr;
128253
129254 return err;
....@@ -150,6 +275,18 @@
150275 }
151276 }
152277
278
+static void thread_stack__pop_trace_end(struct thread_stack *ts)
279
+{
280
+ size_t i;
281
+
282
+ for (i = ts->cnt; i; ) {
283
+ if (ts->stack[--i].trace_end)
284
+ ts->cnt = i;
285
+ else
286
+ return;
287
+ }
288
+}
289
+
153290 static bool thread_stack__in_kernel(struct thread_stack *ts)
154291 {
155292 if (!ts->cnt)
....@@ -169,20 +306,33 @@
169306 .comm = ts->comm,
170307 .db_id = 0,
171308 };
309
+ u64 *parent_db_id;
172310
173311 tse = &ts->stack[idx];
174312 cr.cp = tse->cp;
175313 cr.call_time = tse->timestamp;
176314 cr.return_time = timestamp;
177315 cr.branch_count = ts->branch_count - tse->branch_count;
316
+ cr.insn_count = ts->insn_count - tse->insn_count;
317
+ cr.cyc_count = ts->cyc_count - tse->cyc_count;
318
+ cr.db_id = tse->db_id;
178319 cr.call_ref = tse->ref;
179320 cr.return_ref = ref;
180321 if (tse->no_call)
181322 cr.flags |= CALL_RETURN_NO_CALL;
182323 if (no_return)
183324 cr.flags |= CALL_RETURN_NO_RETURN;
325
+ if (tse->non_call)
326
+ cr.flags |= CALL_RETURN_NON_CALL;
184327
185
- return crp->process(&cr, crp->data);
328
+ /*
329
+ * The parent db_id must be assigned before exporting the child. Note
330
+ * it is not possible to export the parent first because its information
331
+ * is not yet complete because its 'return' has not yet been processed.
332
+ */
333
+ parent_db_id = idx ? &(tse - 1)->db_id : NULL;
334
+
335
+ return crp->process(&cr, parent_db_id, crp->data);
186336 }
187337
188338 static int __thread_stack__flush(struct thread *thread, struct thread_stack *ts)
....@@ -192,6 +342,9 @@
192342
193343 if (!crp) {
194344 ts->cnt = 0;
345
+ ts->br_stack_pos = 0;
346
+ if (ts->br_stack_rb)
347
+ ts->br_stack_rb->nr = 0;
195348 return 0;
196349 }
197350
....@@ -210,25 +363,63 @@
210363
211364 int thread_stack__flush(struct thread *thread)
212365 {
213
- if (thread->ts)
214
- return __thread_stack__flush(thread, thread->ts);
366
+ struct thread_stack *ts = thread->ts;
367
+ unsigned int pos;
368
+ int err = 0;
215369
216
- return 0;
370
+ if (ts) {
371
+ for (pos = 0; pos < ts->arr_sz; pos++) {
372
+ int ret = __thread_stack__flush(thread, ts + pos);
373
+
374
+ if (ret)
375
+ err = ret;
376
+ }
377
+ }
378
+
379
+ return err;
217380 }
218381
219
-int thread_stack__event(struct thread *thread, u32 flags, u64 from_ip,
220
- u64 to_ip, u16 insn_len, u64 trace_nr)
382
+static void thread_stack__update_br_stack(struct thread_stack *ts, u32 flags,
383
+ u64 from_ip, u64 to_ip)
221384 {
385
+ struct branch_stack *bs = ts->br_stack_rb;
386
+ struct branch_entry *be;
387
+
388
+ if (!ts->br_stack_pos)
389
+ ts->br_stack_pos = ts->br_stack_sz;
390
+
391
+ ts->br_stack_pos -= 1;
392
+
393
+ be = &bs->entries[ts->br_stack_pos];
394
+ be->from = from_ip;
395
+ be->to = to_ip;
396
+ be->flags.value = 0;
397
+ be->flags.abort = !!(flags & PERF_IP_FLAG_TX_ABORT);
398
+ be->flags.in_tx = !!(flags & PERF_IP_FLAG_IN_TX);
399
+ /* No support for mispredict */
400
+ be->flags.mispred = ts->mispred_all;
401
+
402
+ if (bs->nr < ts->br_stack_sz)
403
+ bs->nr += 1;
404
+}
405
+
406
+int thread_stack__event(struct thread *thread, int cpu, u32 flags, u64 from_ip,
407
+ u64 to_ip, u16 insn_len, u64 trace_nr, bool callstack,
408
+ unsigned int br_stack_sz, bool mispred_all)
409
+{
410
+ struct thread_stack *ts = thread__stack(thread, cpu);
411
+
222412 if (!thread)
223413 return -EINVAL;
224414
225
- if (!thread->ts) {
226
- thread->ts = thread_stack__new(thread, NULL);
227
- if (!thread->ts) {
415
+ if (!ts) {
416
+ ts = thread_stack__new(thread, cpu, NULL, callstack, br_stack_sz);
417
+ if (!ts) {
228418 pr_warning("Out of memory: no thread stack\n");
229419 return -ENOMEM;
230420 }
231
- thread->ts->trace_nr = trace_nr;
421
+ ts->trace_nr = trace_nr;
422
+ ts->mispred_all = mispred_all;
232423 }
233424
234425 /*
....@@ -236,14 +427,20 @@
236427 * the stack might be completely invalid. Better to report nothing than
237428 * to report something misleading, so flush the stack.
238429 */
239
- if (trace_nr != thread->ts->trace_nr) {
240
- if (thread->ts->trace_nr)
241
- __thread_stack__flush(thread, thread->ts);
242
- thread->ts->trace_nr = trace_nr;
430
+ if (trace_nr != ts->trace_nr) {
431
+ if (ts->trace_nr)
432
+ __thread_stack__flush(thread, ts);
433
+ ts->trace_nr = trace_nr;
243434 }
244435
245
- /* Stop here if thread_stack__process() is in use */
246
- if (thread->ts->crp)
436
+ if (br_stack_sz)
437
+ thread_stack__update_br_stack(ts, flags, from_ip, to_ip);
438
+
439
+ /*
440
+ * Stop here if thread_stack__process() is in use, or not recording call
441
+ * stack.
442
+ */
443
+ if (ts->crp || !callstack)
247444 return 0;
248445
249446 if (flags & PERF_IP_FLAG_CALL) {
....@@ -254,33 +451,63 @@
254451 ret_addr = from_ip + insn_len;
255452 if (ret_addr == to_ip)
256453 return 0; /* Zero-length calls are excluded */
257
- return thread_stack__push(thread->ts, ret_addr);
258
- } else if (flags & PERF_IP_FLAG_RETURN) {
259
- if (!from_ip)
260
- return 0;
261
- thread_stack__pop(thread->ts, to_ip);
454
+ return thread_stack__push(ts, ret_addr,
455
+ flags & PERF_IP_FLAG_TRACE_END);
456
+ } else if (flags & PERF_IP_FLAG_TRACE_BEGIN) {
457
+ /*
458
+ * If the caller did not change the trace number (which would
459
+ * have flushed the stack) then try to make sense of the stack.
460
+ * Possibly, tracing began after returning to the current
461
+ * address, so try to pop that. Also, do not expect a call made
462
+ * when the trace ended, to return, so pop that.
463
+ */
464
+ thread_stack__pop(ts, to_ip);
465
+ thread_stack__pop_trace_end(ts);
466
+ } else if ((flags & PERF_IP_FLAG_RETURN) && from_ip) {
467
+ thread_stack__pop(ts, to_ip);
262468 }
263469
264470 return 0;
265471 }
266472
267
-void thread_stack__set_trace_nr(struct thread *thread, u64 trace_nr)
473
+void thread_stack__set_trace_nr(struct thread *thread, int cpu, u64 trace_nr)
268474 {
269
- if (!thread || !thread->ts)
475
+ struct thread_stack *ts = thread__stack(thread, cpu);
476
+
477
+ if (!ts)
270478 return;
271479
272
- if (trace_nr != thread->ts->trace_nr) {
273
- if (thread->ts->trace_nr)
274
- __thread_stack__flush(thread, thread->ts);
275
- thread->ts->trace_nr = trace_nr;
480
+ if (trace_nr != ts->trace_nr) {
481
+ if (ts->trace_nr)
482
+ __thread_stack__flush(thread, ts);
483
+ ts->trace_nr = trace_nr;
276484 }
485
+}
486
+
487
+static void __thread_stack__free(struct thread *thread, struct thread_stack *ts)
488
+{
489
+ __thread_stack__flush(thread, ts);
490
+ zfree(&ts->stack);
491
+ zfree(&ts->br_stack_rb);
492
+}
493
+
494
+static void thread_stack__reset(struct thread *thread, struct thread_stack *ts)
495
+{
496
+ unsigned int arr_sz = ts->arr_sz;
497
+
498
+ __thread_stack__free(thread, ts);
499
+ memset(ts, 0, sizeof(*ts));
500
+ ts->arr_sz = arr_sz;
277501 }
278502
279503 void thread_stack__free(struct thread *thread)
280504 {
281
- if (thread->ts) {
282
- __thread_stack__flush(thread, thread->ts);
283
- zfree(&thread->ts->stack);
505
+ struct thread_stack *ts = thread->ts;
506
+ unsigned int pos;
507
+
508
+ if (ts) {
509
+ for (pos = 0; pos < ts->arr_sz; pos++)
510
+ __thread_stack__free(thread, ts + pos);
284511 zfree(&thread->ts);
285512 }
286513 }
....@@ -290,9 +517,11 @@
290517 return ip < kernel_start ? PERF_CONTEXT_USER : PERF_CONTEXT_KERNEL;
291518 }
292519
293
-void thread_stack__sample(struct thread *thread, struct ip_callchain *chain,
520
+void thread_stack__sample(struct thread *thread, int cpu,
521
+ struct ip_callchain *chain,
294522 size_t sz, u64 ip, u64 kernel_start)
295523 {
524
+ struct thread_stack *ts = thread__stack(thread, cpu);
296525 u64 context = callchain_context(ip, kernel_start);
297526 u64 last_context;
298527 size_t i, j;
....@@ -305,15 +534,15 @@
305534 chain->ips[0] = context;
306535 chain->ips[1] = ip;
307536
308
- if (!thread || !thread->ts) {
537
+ if (!ts) {
309538 chain->nr = 2;
310539 return;
311540 }
312541
313542 last_context = context;
314543
315
- for (i = 2, j = 1; i < sz && j <= thread->ts->cnt; i++, j++) {
316
- ip = thread->ts->stack[thread->ts->cnt - j].ret_addr;
544
+ for (i = 2, j = 1; i < sz && j <= ts->cnt; i++, j++) {
545
+ ip = ts->stack[ts->cnt - j].ret_addr;
317546 context = callchain_context(ip, kernel_start);
318547 if (context != last_context) {
319548 if (i >= sz - 1)
....@@ -327,8 +556,201 @@
327556 chain->nr = i;
328557 }
329558
559
+/*
560
+ * Hardware sample records, created some time after the event occurred, need to
561
+ * have subsequent addresses removed from the call chain.
562
+ */
563
+void thread_stack__sample_late(struct thread *thread, int cpu,
564
+ struct ip_callchain *chain, size_t sz,
565
+ u64 sample_ip, u64 kernel_start)
566
+{
567
+ struct thread_stack *ts = thread__stack(thread, cpu);
568
+ u64 sample_context = callchain_context(sample_ip, kernel_start);
569
+ u64 last_context, context, ip;
570
+ size_t nr = 0, j;
571
+
572
+ if (sz < 2) {
573
+ chain->nr = 0;
574
+ return;
575
+ }
576
+
577
+ if (!ts)
578
+ goto out;
579
+
580
+ /*
581
+ * When tracing kernel space, kernel addresses occur at the top of the
582
+ * call chain after the event occurred but before tracing stopped.
583
+ * Skip them.
584
+ */
585
+ for (j = 1; j <= ts->cnt; j++) {
586
+ ip = ts->stack[ts->cnt - j].ret_addr;
587
+ context = callchain_context(ip, kernel_start);
588
+ if (context == PERF_CONTEXT_USER ||
589
+ (context == sample_context && ip == sample_ip))
590
+ break;
591
+ }
592
+
593
+ last_context = sample_ip; /* Use sample_ip as an invalid context */
594
+
595
+ for (; nr < sz && j <= ts->cnt; nr++, j++) {
596
+ ip = ts->stack[ts->cnt - j].ret_addr;
597
+ context = callchain_context(ip, kernel_start);
598
+ if (context != last_context) {
599
+ if (nr >= sz - 1)
600
+ break;
601
+ chain->ips[nr++] = context;
602
+ last_context = context;
603
+ }
604
+ chain->ips[nr] = ip;
605
+ }
606
+out:
607
+ if (nr) {
608
+ chain->nr = nr;
609
+ } else {
610
+ chain->ips[0] = sample_context;
611
+ chain->ips[1] = sample_ip;
612
+ chain->nr = 2;
613
+ }
614
+}
615
+
616
+void thread_stack__br_sample(struct thread *thread, int cpu,
617
+ struct branch_stack *dst, unsigned int sz)
618
+{
619
+ struct thread_stack *ts = thread__stack(thread, cpu);
620
+ const size_t bsz = sizeof(struct branch_entry);
621
+ struct branch_stack *src;
622
+ struct branch_entry *be;
623
+ unsigned int nr;
624
+
625
+ dst->nr = 0;
626
+
627
+ if (!ts)
628
+ return;
629
+
630
+ src = ts->br_stack_rb;
631
+ if (!src->nr)
632
+ return;
633
+
634
+ dst->nr = min((unsigned int)src->nr, sz);
635
+
636
+ be = &dst->entries[0];
637
+ nr = min(ts->br_stack_sz - ts->br_stack_pos, (unsigned int)dst->nr);
638
+ memcpy(be, &src->entries[ts->br_stack_pos], bsz * nr);
639
+
640
+ if (src->nr >= ts->br_stack_sz) {
641
+ sz -= nr;
642
+ be = &dst->entries[nr];
643
+ nr = min(ts->br_stack_pos, sz);
644
+ memcpy(be, &src->entries[0], bsz * ts->br_stack_pos);
645
+ }
646
+}
647
+
648
+/* Start of user space branch entries */
649
+static bool us_start(struct branch_entry *be, u64 kernel_start, bool *start)
650
+{
651
+ if (!*start)
652
+ *start = be->to && be->to < kernel_start;
653
+
654
+ return *start;
655
+}
656
+
657
+/*
658
+ * Start of branch entries after the ip fell in between 2 branches, or user
659
+ * space branch entries.
660
+ */
661
+static bool ks_start(struct branch_entry *be, u64 sample_ip, u64 kernel_start,
662
+ bool *start, struct branch_entry *nb)
663
+{
664
+ if (!*start) {
665
+ *start = (nb && sample_ip >= be->to && sample_ip <= nb->from) ||
666
+ be->from < kernel_start ||
667
+ (be->to && be->to < kernel_start);
668
+ }
669
+
670
+ return *start;
671
+}
672
+
673
+/*
674
+ * Hardware sample records, created some time after the event occurred, need to
675
+ * have subsequent addresses removed from the branch stack.
676
+ */
677
+void thread_stack__br_sample_late(struct thread *thread, int cpu,
678
+ struct branch_stack *dst, unsigned int sz,
679
+ u64 ip, u64 kernel_start)
680
+{
681
+ struct thread_stack *ts = thread__stack(thread, cpu);
682
+ struct branch_entry *d, *s, *spos, *ssz;
683
+ struct branch_stack *src;
684
+ unsigned int nr = 0;
685
+ bool start = false;
686
+
687
+ dst->nr = 0;
688
+
689
+ if (!ts)
690
+ return;
691
+
692
+ src = ts->br_stack_rb;
693
+ if (!src->nr)
694
+ return;
695
+
696
+ spos = &src->entries[ts->br_stack_pos];
697
+ ssz = &src->entries[ts->br_stack_sz];
698
+
699
+ d = &dst->entries[0];
700
+ s = spos;
701
+
702
+ if (ip < kernel_start) {
703
+ /*
704
+ * User space sample: start copying branch entries when the
705
+ * branch is in user space.
706
+ */
707
+ for (s = spos; s < ssz && nr < sz; s++) {
708
+ if (us_start(s, kernel_start, &start)) {
709
+ *d++ = *s;
710
+ nr += 1;
711
+ }
712
+ }
713
+
714
+ if (src->nr >= ts->br_stack_sz) {
715
+ for (s = &src->entries[0]; s < spos && nr < sz; s++) {
716
+ if (us_start(s, kernel_start, &start)) {
717
+ *d++ = *s;
718
+ nr += 1;
719
+ }
720
+ }
721
+ }
722
+ } else {
723
+ struct branch_entry *nb = NULL;
724
+
725
+ /*
726
+ * Kernel space sample: start copying branch entries when the ip
727
+ * falls in between 2 branches (or the branch is in user space
728
+ * because then the start must have been missed).
729
+ */
730
+ for (s = spos; s < ssz && nr < sz; s++) {
731
+ if (ks_start(s, ip, kernel_start, &start, nb)) {
732
+ *d++ = *s;
733
+ nr += 1;
734
+ }
735
+ nb = s;
736
+ }
737
+
738
+ if (src->nr >= ts->br_stack_sz) {
739
+ for (s = &src->entries[0]; s < spos && nr < sz; s++) {
740
+ if (ks_start(s, ip, kernel_start, &start, nb)) {
741
+ *d++ = *s;
742
+ nr += 1;
743
+ }
744
+ nb = s;
745
+ }
746
+ }
747
+ }
748
+
749
+ dst->nr = nr;
750
+}
751
+
330752 struct call_return_processor *
331
-call_return_processor__new(int (*process)(struct call_return *cr, void *data),
753
+call_return_processor__new(int (*process)(struct call_return *cr, u64 *parent_db_id, void *data),
332754 void *data)
333755 {
334756 struct call_return_processor *crp;
....@@ -358,10 +780,13 @@
358780
359781 static int thread_stack__push_cp(struct thread_stack *ts, u64 ret_addr,
360782 u64 timestamp, u64 ref, struct call_path *cp,
361
- bool no_call)
783
+ bool no_call, bool trace_end)
362784 {
363785 struct thread_stack_entry *tse;
364786 int err;
787
+
788
+ if (!cp)
789
+ return -ENOMEM;
365790
366791 if (ts->cnt == ts->sz) {
367792 err = thread_stack__grow(ts);
....@@ -374,8 +799,13 @@
374799 tse->timestamp = timestamp;
375800 tse->ref = ref;
376801 tse->branch_count = ts->branch_count;
802
+ tse->insn_count = ts->insn_count;
803
+ tse->cyc_count = ts->cyc_count;
377804 tse->cp = cp;
378805 tse->no_call = no_call;
806
+ tse->trace_end = trace_end;
807
+ tse->non_call = false;
808
+ tse->db_id = 0;
379809
380810 return 0;
381811 }
....@@ -397,14 +827,16 @@
397827 timestamp, ref, false);
398828 }
399829
400
- if (ts->stack[ts->cnt - 1].ret_addr == ret_addr) {
830
+ if (ts->stack[ts->cnt - 1].ret_addr == ret_addr &&
831
+ !ts->stack[ts->cnt - 1].non_call) {
401832 return thread_stack__call_return(thread, ts, --ts->cnt,
402833 timestamp, ref, false);
403834 } else {
404835 size_t i = ts->cnt - 1;
405836
406837 while (i--) {
407
- if (ts->stack[i].ret_addr != ret_addr)
838
+ if (ts->stack[i].ret_addr != ret_addr ||
839
+ ts->stack[i].non_call)
408840 continue;
409841 i += 1;
410842 while (ts->cnt > i) {
....@@ -423,7 +855,7 @@
423855 return 1;
424856 }
425857
426
-static int thread_stack__bottom(struct thread *thread, struct thread_stack *ts,
858
+static int thread_stack__bottom(struct thread_stack *ts,
427859 struct perf_sample *sample,
428860 struct addr_location *from_al,
429861 struct addr_location *to_al, u64 ref)
....@@ -445,11 +877,26 @@
445877
446878 cp = call_path__findnew(cpr, &cpr->call_path, sym, ip,
447879 ts->kernel_start);
448
- if (!cp)
449
- return -ENOMEM;
450880
451
- return thread_stack__push_cp(thread->ts, ip, sample->time, ref, cp,
452
- true);
881
+ return thread_stack__push_cp(ts, ip, sample->time, ref, cp,
882
+ true, false);
883
+}
884
+
885
+static int thread_stack__pop_ks(struct thread *thread, struct thread_stack *ts,
886
+ struct perf_sample *sample, u64 ref)
887
+{
888
+ u64 tm = sample->time;
889
+ int err;
890
+
891
+ /* Return to userspace, so pop all kernel addresses */
892
+ while (thread_stack__in_kernel(ts)) {
893
+ err = thread_stack__call_return(thread, ts, --ts->cnt,
894
+ tm, ref, true);
895
+ if (err)
896
+ return err;
897
+ }
898
+
899
+ return 0;
453900 }
454901
455902 static int thread_stack__no_call_return(struct thread *thread,
....@@ -459,59 +906,91 @@
459906 struct addr_location *to_al, u64 ref)
460907 {
461908 struct call_path_root *cpr = ts->crp->cpr;
909
+ struct call_path *root = &cpr->call_path;
910
+ struct symbol *fsym = from_al->sym;
911
+ struct symbol *tsym = to_al->sym;
462912 struct call_path *cp, *parent;
463913 u64 ks = ts->kernel_start;
914
+ u64 addr = sample->addr;
915
+ u64 tm = sample->time;
916
+ u64 ip = sample->ip;
464917 int err;
465918
466
- if (sample->ip >= ks && sample->addr < ks) {
919
+ if (ip >= ks && addr < ks) {
467920 /* Return to userspace, so pop all kernel addresses */
468
- while (thread_stack__in_kernel(ts)) {
469
- err = thread_stack__call_return(thread, ts, --ts->cnt,
470
- sample->time, ref,
471
- true);
472
- if (err)
473
- return err;
474
- }
921
+ err = thread_stack__pop_ks(thread, ts, sample, ref);
922
+ if (err)
923
+ return err;
475924
476925 /* If the stack is empty, push the userspace address */
477926 if (!ts->cnt) {
478
- cp = call_path__findnew(cpr, &cpr->call_path,
479
- to_al->sym, sample->addr,
480
- ts->kernel_start);
481
- if (!cp)
482
- return -ENOMEM;
483
- return thread_stack__push_cp(ts, 0, sample->time, ref,
484
- cp, true);
927
+ cp = call_path__findnew(cpr, root, tsym, addr, ks);
928
+ return thread_stack__push_cp(ts, 0, tm, ref, cp, true,
929
+ false);
485930 }
486
- } else if (thread_stack__in_kernel(ts) && sample->ip < ks) {
931
+ } else if (thread_stack__in_kernel(ts) && ip < ks) {
487932 /* Return to userspace, so pop all kernel addresses */
488
- while (thread_stack__in_kernel(ts)) {
489
- err = thread_stack__call_return(thread, ts, --ts->cnt,
490
- sample->time, ref,
491
- true);
492
- if (err)
493
- return err;
494
- }
933
+ err = thread_stack__pop_ks(thread, ts, sample, ref);
934
+ if (err)
935
+ return err;
495936 }
496937
497938 if (ts->cnt)
498939 parent = ts->stack[ts->cnt - 1].cp;
499940 else
500
- parent = &cpr->call_path;
941
+ parent = root;
501942
502
- /* This 'return' had no 'call', so push and pop top of stack */
503
- cp = call_path__findnew(cpr, parent, from_al->sym, sample->ip,
504
- ts->kernel_start);
505
- if (!cp)
506
- return -ENOMEM;
943
+ if (parent->sym == from_al->sym) {
944
+ /*
945
+ * At the bottom of the stack, assume the missing 'call' was
946
+ * before the trace started. So, pop the current symbol and push
947
+ * the 'to' symbol.
948
+ */
949
+ if (ts->cnt == 1) {
950
+ err = thread_stack__call_return(thread, ts, --ts->cnt,
951
+ tm, ref, false);
952
+ if (err)
953
+ return err;
954
+ }
507955
508
- err = thread_stack__push_cp(ts, sample->addr, sample->time, ref, cp,
509
- true);
956
+ if (!ts->cnt) {
957
+ cp = call_path__findnew(cpr, root, tsym, addr, ks);
958
+
959
+ return thread_stack__push_cp(ts, addr, tm, ref, cp,
960
+ true, false);
961
+ }
962
+
963
+ /*
964
+ * Otherwise assume the 'return' is being used as a jump (e.g.
965
+ * retpoline) and just push the 'to' symbol.
966
+ */
967
+ cp = call_path__findnew(cpr, parent, tsym, addr, ks);
968
+
969
+ err = thread_stack__push_cp(ts, 0, tm, ref, cp, true, false);
970
+ if (!err)
971
+ ts->stack[ts->cnt - 1].non_call = true;
972
+
973
+ return err;
974
+ }
975
+
976
+ /*
977
+ * Assume 'parent' has not yet returned, so push 'to', and then push and
978
+ * pop 'from'.
979
+ */
980
+
981
+ cp = call_path__findnew(cpr, parent, tsym, addr, ks);
982
+
983
+ err = thread_stack__push_cp(ts, addr, tm, ref, cp, true, false);
510984 if (err)
511985 return err;
512986
513
- return thread_stack__pop_cp(thread, ts, sample->addr, sample->time, ref,
514
- to_al->sym);
987
+ cp = call_path__findnew(cpr, cp, fsym, ip, ks);
988
+
989
+ err = thread_stack__push_cp(ts, ip, tm, ref, cp, true, false);
990
+ if (err)
991
+ return err;
992
+
993
+ return thread_stack__call_return(thread, ts, --ts->cnt, tm, ref, false);
515994 }
516995
517996 static int thread_stack__trace_begin(struct thread *thread,
....@@ -526,7 +1005,7 @@
5261005
5271006 /* Pop trace end */
5281007 tse = &ts->stack[ts->cnt - 1];
529
- if (tse->cp->sym == NULL && tse->cp->ip == 0) {
1008
+ if (tse->trace_end) {
5301009 err = thread_stack__call_return(thread, ts, --ts->cnt,
5311010 timestamp, ref, false);
5321011 if (err)
....@@ -549,13 +1028,75 @@
5491028
5501029 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp, NULL, 0,
5511030 ts->kernel_start);
552
- if (!cp)
553
- return -ENOMEM;
5541031
5551032 ret_addr = sample->ip + sample->insn_len;
5561033
5571034 return thread_stack__push_cp(ts, ret_addr, sample->time, ref, cp,
558
- false);
1035
+ false, true);
1036
+}
1037
+
1038
+static bool is_x86_retpoline(const char *name)
1039
+{
1040
+ const char *p = strstr(name, "__x86_indirect_thunk_");
1041
+
1042
+ return p == name || !strcmp(name, "__indirect_thunk_start");
1043
+}
1044
+
1045
+/*
1046
+ * x86 retpoline functions pollute the call graph. This function removes them.
1047
+ * This does not handle function return thunks, nor is there any improvement
1048
+ * for the handling of inline thunks or extern thunks.
1049
+ */
1050
+static int thread_stack__x86_retpoline(struct thread_stack *ts,
1051
+ struct perf_sample *sample,
1052
+ struct addr_location *to_al)
1053
+{
1054
+ struct thread_stack_entry *tse = &ts->stack[ts->cnt - 1];
1055
+ struct call_path_root *cpr = ts->crp->cpr;
1056
+ struct symbol *sym = tse->cp->sym;
1057
+ struct symbol *tsym = to_al->sym;
1058
+ struct call_path *cp;
1059
+
1060
+ if (sym && is_x86_retpoline(sym->name)) {
1061
+ /*
1062
+ * This is a x86 retpoline fn. It pollutes the call graph by
1063
+ * showing up everywhere there is an indirect branch, but does
1064
+ * not itself mean anything. Here the top-of-stack is removed,
1065
+ * by decrementing the stack count, and then further down, the
1066
+ * resulting top-of-stack is replaced with the actual target.
1067
+ * The result is that the retpoline functions will no longer
1068
+ * appear in the call graph. Note this only affects the call
1069
+ * graph, since all the original branches are left unchanged.
1070
+ */
1071
+ ts->cnt -= 1;
1072
+ sym = ts->stack[ts->cnt - 2].cp->sym;
1073
+ if (sym && sym == tsym && to_al->addr != tsym->start) {
1074
+ /*
1075
+ * Target is back to the middle of the symbol we came
1076
+ * from so assume it is an indirect jmp and forget it
1077
+ * altogether.
1078
+ */
1079
+ ts->cnt -= 1;
1080
+ return 0;
1081
+ }
1082
+ } else if (sym && sym == tsym) {
1083
+ /*
1084
+ * Target is back to the symbol we came from so assume it is an
1085
+ * indirect jmp and forget it altogether.
1086
+ */
1087
+ ts->cnt -= 1;
1088
+ return 0;
1089
+ }
1090
+
1091
+ cp = call_path__findnew(cpr, ts->stack[ts->cnt - 2].cp, tsym,
1092
+ sample->addr, ts->kernel_start);
1093
+ if (!cp)
1094
+ return -ENOMEM;
1095
+
1096
+ /* Replace the top-of-stack with the actual target */
1097
+ ts->stack[ts->cnt - 1].cp = cp;
1098
+
1099
+ return 0;
5591100 }
5601101
5611102 int thread_stack__process(struct thread *thread, struct comm *comm,
....@@ -564,26 +1105,26 @@
5641105 struct addr_location *to_al, u64 ref,
5651106 struct call_return_processor *crp)
5661107 {
567
- struct thread_stack *ts = thread->ts;
1108
+ struct thread_stack *ts = thread__stack(thread, sample->cpu);
1109
+ enum retpoline_state_t rstate;
5681110 int err = 0;
5691111
570
- if (ts) {
571
- if (!ts->crp) {
572
- /* Supersede thread_stack__event() */
573
- thread_stack__free(thread);
574
- thread->ts = thread_stack__new(thread, crp);
575
- if (!thread->ts)
576
- return -ENOMEM;
577
- ts = thread->ts;
578
- ts->comm = comm;
579
- }
580
- } else {
581
- thread->ts = thread_stack__new(thread, crp);
582
- if (!thread->ts)
1112
+ if (ts && !ts->crp) {
1113
+ /* Supersede thread_stack__event() */
1114
+ thread_stack__reset(thread, ts);
1115
+ ts = NULL;
1116
+ }
1117
+
1118
+ if (!ts) {
1119
+ ts = thread_stack__new(thread, sample->cpu, crp, true, 0);
1120
+ if (!ts)
5831121 return -ENOMEM;
584
- ts = thread->ts;
5851122 ts->comm = comm;
5861123 }
1124
+
1125
+ rstate = ts->rstate;
1126
+ if (rstate == X86_RETPOLINE_DETECTED)
1127
+ ts->rstate = X86_RETPOLINE_POSSIBLE;
5871128
5881129 /* Flush stack on exec */
5891130 if (ts->comm != comm && thread->pid_ == thread->tid) {
....@@ -595,16 +1136,18 @@
5951136
5961137 /* If the stack is empty, put the current symbol on the stack */
5971138 if (!ts->cnt) {
598
- err = thread_stack__bottom(thread, ts, sample, from_al, to_al,
599
- ref);
1139
+ err = thread_stack__bottom(ts, sample, from_al, to_al, ref);
6001140 if (err)
6011141 return err;
6021142 }
6031143
6041144 ts->branch_count += 1;
1145
+ ts->insn_count += sample->insn_cnt;
1146
+ ts->cyc_count += sample->cyc_cnt;
6051147 ts->last_time = sample->time;
6061148
6071149 if (sample->flags & PERF_IP_FLAG_CALL) {
1150
+ bool trace_end = sample->flags & PERF_IP_FLAG_TRACE_END;
6081151 struct call_path_root *cpr = ts->crp->cpr;
6091152 struct call_path *cp;
6101153 u64 ret_addr;
....@@ -619,13 +1162,37 @@
6191162 cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
6201163 to_al->sym, sample->addr,
6211164 ts->kernel_start);
622
- if (!cp)
623
- return -ENOMEM;
6241165 err = thread_stack__push_cp(ts, ret_addr, sample->time, ref,
625
- cp, false);
1166
+ cp, false, trace_end);
1167
+
1168
+ /*
1169
+ * A call to the same symbol but not the start of the symbol,
1170
+ * may be the start of a x86 retpoline.
1171
+ */
1172
+ if (!err && rstate == X86_RETPOLINE_POSSIBLE && to_al->sym &&
1173
+ from_al->sym == to_al->sym &&
1174
+ to_al->addr != to_al->sym->start)
1175
+ ts->rstate = X86_RETPOLINE_DETECTED;
1176
+
6261177 } else if (sample->flags & PERF_IP_FLAG_RETURN) {
627
- if (!sample->ip || !sample->addr)
1178
+ if (!sample->addr) {
1179
+ u32 return_from_kernel = PERF_IP_FLAG_SYSCALLRET |
1180
+ PERF_IP_FLAG_INTERRUPT;
1181
+
1182
+ if (!(sample->flags & return_from_kernel))
1183
+ return 0;
1184
+
1185
+ /* Pop kernel stack */
1186
+ return thread_stack__pop_ks(thread, ts, sample, ref);
1187
+ }
1188
+
1189
+ if (!sample->ip)
6281190 return 0;
1191
+
1192
+ /* x86 retpoline 'return' doesn't match the stack */
1193
+ if (rstate == X86_RETPOLINE_DETECTED && ts->cnt > 2 &&
1194
+ ts->stack[ts->cnt - 1].ret_addr != sample->addr)
1195
+ return thread_stack__x86_retpoline(ts, sample, to_al);
6291196
6301197 err = thread_stack__pop_cp(thread, ts, sample->addr,
6311198 sample->time, ref, from_al->sym);
....@@ -639,14 +1206,35 @@
6391206 err = thread_stack__trace_begin(thread, ts, sample->time, ref);
6401207 } else if (sample->flags & PERF_IP_FLAG_TRACE_END) {
6411208 err = thread_stack__trace_end(ts, sample, ref);
1209
+ } else if (sample->flags & PERF_IP_FLAG_BRANCH &&
1210
+ from_al->sym != to_al->sym && to_al->sym &&
1211
+ to_al->addr == to_al->sym->start) {
1212
+ struct call_path_root *cpr = ts->crp->cpr;
1213
+ struct call_path *cp;
1214
+
1215
+ /*
1216
+ * The compiler might optimize a call/ret combination by making
1217
+ * it a jmp. Make that visible by recording on the stack a
1218
+ * branch to the start of a different symbol. Note, that means
1219
+ * when a ret pops the stack, all jmps must be popped off first.
1220
+ */
1221
+ cp = call_path__findnew(cpr, ts->stack[ts->cnt - 1].cp,
1222
+ to_al->sym, sample->addr,
1223
+ ts->kernel_start);
1224
+ err = thread_stack__push_cp(ts, 0, sample->time, ref, cp, false,
1225
+ false);
1226
+ if (!err)
1227
+ ts->stack[ts->cnt - 1].non_call = true;
6421228 }
6431229
6441230 return err;
6451231 }
6461232
647
-size_t thread_stack__depth(struct thread *thread)
1233
+size_t thread_stack__depth(struct thread *thread, int cpu)
6481234 {
649
- if (!thread->ts)
1235
+ struct thread_stack *ts = thread__stack(thread, cpu);
1236
+
1237
+ if (!ts)
6501238 return 0;
651
- return thread->ts->cnt;
1239
+ return ts->cnt;
6521240 }