2023-12-08 01573e231f18eb2d99162747186f59511f56b64d
kernel/tools/perf/builtin-record.c
....@@ -8,10 +8,7 @@
88 */
99 #include "builtin.h"
1010
11
-#include "perf.h"
12
-
1311 #include "util/build-id.h"
14
-#include "util/util.h"
1512 #include <subcmd/parse-options.h>
1613 #include "util/parse-events.h"
1714 #include "util/config.h"
....@@ -23,10 +20,12 @@
2320 #include "util/evlist.h"
2421 #include "util/evsel.h"
2522 #include "util/debug.h"
26
-#include "util/drv_configs.h"
23
+#include "util/mmap.h"
24
+#include "util/target.h"
2725 #include "util/session.h"
2826 #include "util/tool.h"
2927 #include "util/symbol.h"
28
+#include "util/record.h"
3029 #include "util/cpumap.h"
3130 #include "util/thread_map.h"
3231 #include "util/data.h"
....@@ -35,24 +34,44 @@
3534 #include "util/tsc.h"
3635 #include "util/parse-branch-options.h"
3736 #include "util/parse-regs-options.h"
37
+#include "util/perf_api_probe.h"
3838 #include "util/llvm-utils.h"
3939 #include "util/bpf-loader.h"
4040 #include "util/trigger.h"
4141 #include "util/perf-hooks.h"
42
+#include "util/cpu-set-sched.h"
43
+#include "util/synthetic-events.h"
4244 #include "util/time-utils.h"
4345 #include "util/units.h"
46
+#include "util/bpf-event.h"
47
+#include "util/util.h"
48
+#include "util/pfm.h"
49
+#include "util/clockid.h"
4450 #include "asm/bug.h"
51
+#include "perf.h"
4552
4653 #include <errno.h>
4754 #include <inttypes.h>
4855 #include <locale.h>
4956 #include <poll.h>
57
+#include <pthread.h>
5058 #include <unistd.h>
5159 #include <sched.h>
5260 #include <signal.h>
61
+#ifdef HAVE_EVENTFD_SUPPORT
62
+#include <sys/eventfd.h>
63
+#endif
5364 #include <sys/mman.h>
5465 #include <sys/wait.h>
66
+#include <sys/types.h>
67
+#include <sys/stat.h>
68
+#include <fcntl.h>
69
+#include <linux/err.h>
70
+#include <linux/string.h>
5571 #include <linux/time64.h>
72
+#include <linux/zalloc.h>
73
+#include <linux/bitmap.h>
74
+#include <sys/time.h>
5675
5776 struct switch_output {
5877 bool enabled;
....@@ -61,6 +80,9 @@
6180 unsigned long time;
6281 const char *str;
6382 bool set;
83
+ char **filenames;
84
+ int num_files;
85
+ int cur_file;
6486 };
6587
6688 struct record {
....@@ -69,9 +91,12 @@
6991 u64 bytes_written;
7092 struct perf_data data;
7193 struct auxtrace_record *itr;
72
- struct perf_evlist *evlist;
94
+ struct evlist *evlist;
7395 struct perf_session *session;
96
+ struct evlist *sb_evlist;
97
+ pthread_t thread_id;
7498 int realtime_prio;
99
+ bool switch_output_event_set;
75100 bool no_buildid;
76101 bool no_buildid_set;
77102 bool no_buildid_cache;
....@@ -81,11 +106,19 @@
81106 bool timestamp_boundary;
82107 struct switch_output switch_output;
83108 unsigned long long samples;
109
+ struct mmap_cpu_mask affinity_mask;
110
+ unsigned long output_max_size; /* = 0: unlimited */
84111 };
112
+
113
+static volatile int done;
85114
86115 static volatile int auxtrace_record__snapshot_started;
87116 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
88117 static DEFINE_TRIGGER(switch_output_trigger);
118
+
119
+static const char *affinity_tags[PERF_AFFINITY_MAX] = {
120
+ "SYS", "NODE", "CPU"
121
+};
89122
90123 static bool switch_output_signal(struct record *rec)
91124 {
....@@ -106,19 +139,371 @@
106139 trigger_is_ready(&switch_output_trigger);
107140 }
108141
109
-static int record__write(struct record *rec, void *bf, size_t size)
142
+static bool record__output_max_size_exceeded(struct record *rec)
110143 {
111
- if (perf_data__write(rec->session->data, bf, size) < 0) {
144
+ return rec->output_max_size &&
145
+ (rec->bytes_written >= rec->output_max_size);
146
+}
147
+
148
+static int record__write(struct record *rec, struct mmap *map __maybe_unused,
149
+ void *bf, size_t size)
150
+{
151
+ struct perf_data_file *file = &rec->session->data->file;
152
+
153
+ if (perf_data_file__write(file, bf, size) < 0) {
112154 pr_err("failed to write perf data, error: %m\n");
113155 return -1;
114156 }
115157
116158 rec->bytes_written += size;
117159
160
+ if (record__output_max_size_exceeded(rec) && !done) {
161
+ fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
162
+ " stopping session ]\n",
163
+ rec->bytes_written >> 10);
164
+ done = 1;
165
+ }
166
+
118167 if (switch_output_size(rec))
119168 trigger_hit(&switch_output_trigger);
120169
121170 return 0;
171
+}
172
+
173
+static int record__aio_enabled(struct record *rec);
174
+static int record__comp_enabled(struct record *rec);
175
+static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
176
+ void *src, size_t src_size);
177
+
178
+#ifdef HAVE_AIO_SUPPORT
179
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
180
+ void *buf, size_t size, off_t off)
181
+{
182
+ int rc;
183
+
184
+ cblock->aio_fildes = trace_fd;
185
+ cblock->aio_buf = buf;
186
+ cblock->aio_nbytes = size;
187
+ cblock->aio_offset = off;
188
+ cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
189
+
190
+ do {
191
+ rc = aio_write(cblock);
192
+ if (rc == 0) {
193
+ break;
194
+ } else if (errno != EAGAIN) {
195
+ cblock->aio_fildes = -1;
196
+ pr_err("failed to queue perf data, error: %m\n");
197
+ break;
198
+ }
199
+ } while (1);
200
+
201
+ return rc;
202
+}
203
+
204
+static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
205
+{
206
+ void *rem_buf;
207
+ off_t rem_off;
208
+ size_t rem_size;
209
+ int rc, aio_errno;
210
+ ssize_t aio_ret, written;
211
+
212
+ aio_errno = aio_error(cblock);
213
+ if (aio_errno == EINPROGRESS)
214
+ return 0;
215
+
216
+ written = aio_ret = aio_return(cblock);
217
+ if (aio_ret < 0) {
218
+ if (aio_errno != EINTR)
219
+ pr_err("failed to write perf data, error: %m\n");
220
+ written = 0;
221
+ }
222
+
223
+ rem_size = cblock->aio_nbytes - written;
224
+
225
+ if (rem_size == 0) {
226
+ cblock->aio_fildes = -1;
227
+ /*
228
+ * md->refcount is incremented in record__aio_pushfn() for
229
+ * every aio write request started in record__aio_push() so
230
+ * decrement it because the request is now complete.
231
+ */
232
+ perf_mmap__put(&md->core);
233
+ rc = 1;
234
+ } else {
235
+ /*
236
+ * aio write request may require restart with the
237
+ * remainder if the kernel didn't write whole
238
+ * chunk at once.
239
+ */
240
+ rem_off = cblock->aio_offset + written;
241
+ rem_buf = (void *)(cblock->aio_buf + written);
242
+ record__aio_write(cblock, cblock->aio_fildes,
243
+ rem_buf, rem_size, rem_off);
244
+ rc = 0;
245
+ }
246
+
247
+ return rc;
248
+}
249
+
250
+static int record__aio_sync(struct mmap *md, bool sync_all)
251
+{
252
+ struct aiocb **aiocb = md->aio.aiocb;
253
+ struct aiocb *cblocks = md->aio.cblocks;
254
+ struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
255
+ int i, do_suspend;
256
+
257
+ do {
258
+ do_suspend = 0;
259
+ for (i = 0; i < md->aio.nr_cblocks; ++i) {
260
+ if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
261
+ if (sync_all)
262
+ aiocb[i] = NULL;
263
+ else
264
+ return i;
265
+ } else {
266
+ /*
267
+ * Started aio write is not complete yet
268
+ * so it has to be waited before the
269
+ * next allocation.
270
+ */
271
+ aiocb[i] = &cblocks[i];
272
+ do_suspend = 1;
273
+ }
274
+ }
275
+ if (!do_suspend)
276
+ return -1;
277
+
278
+ while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
279
+ if (!(errno == EAGAIN || errno == EINTR))
280
+ pr_err("failed to sync perf data, error: %m\n");
281
+ }
282
+ } while (1);
283
+}
284
+
285
+struct record_aio {
286
+ struct record *rec;
287
+ void *data;
288
+ size_t size;
289
+};
290
+
291
+static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
292
+{
293
+ struct record_aio *aio = to;
294
+
295
+ /*
296
+ * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
297
+ * to release space in the kernel buffer as fast as possible, calling
298
+ * perf_mmap__consume() from perf_mmap__push() function.
299
+ *
300
+ * That lets the kernel proceed with storing more profiling data into
301
+ * the kernel buffer earlier than other per-cpu kernel buffers are handled.
302
+ *
303
+ * Copying can be done in two steps in case the chunk of profiling data
304
+ * crosses the upper bound of the kernel buffer. In this case we first move
305
+ * part of data from map->start till the upper bound and then the remainder
306
+ * from the beginning of the kernel buffer till the end of the data chunk.
307
+ */
308
+
309
+ if (record__comp_enabled(aio->rec)) {
310
+ size = zstd_compress(aio->rec->session, aio->data + aio->size,
311
+ mmap__mmap_len(map) - aio->size,
312
+ buf, size);
313
+ } else {
314
+ memcpy(aio->data + aio->size, buf, size);
315
+ }
316
+
317
+ if (!aio->size) {
318
+ /*
319
+ * Increment map->refcount to guard map->aio.data[] buffer
320
+ * from premature deallocation because map object can be
321
+ * released earlier than aio write request started on
322
+ * map->aio.data[] buffer is complete.
323
+ *
324
+ * perf_mmap__put() is done at record__aio_complete()
325
+ * after started aio request completion or at record__aio_push()
326
+ * if the request failed to start.
327
+ */
328
+ perf_mmap__get(&map->core);
329
+ }
330
+
331
+ aio->size += size;
332
+
333
+ return size;
334
+}
335
+
336
+static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
337
+{
338
+ int ret, idx;
339
+ int trace_fd = rec->session->data->file.fd;
340
+ struct record_aio aio = { .rec = rec, .size = 0 };
341
+
342
+ /*
343
+ * Call record__aio_sync() to wait till map->aio.data[] buffer
344
+ * becomes available after previous aio write operation.
345
+ */
346
+
347
+ idx = record__aio_sync(map, false);
348
+ aio.data = map->aio.data[idx];
349
+ ret = perf_mmap__push(map, &aio, record__aio_pushfn);
350
+ if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
351
+ return ret;
352
+
353
+ rec->samples++;
354
+ ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
355
+ if (!ret) {
356
+ *off += aio.size;
357
+ rec->bytes_written += aio.size;
358
+ if (switch_output_size(rec))
359
+ trigger_hit(&switch_output_trigger);
360
+ } else {
361
+ /*
362
+ * Decrement map->refcount incremented in record__aio_pushfn()
363
+ * back if record__aio_write() operation failed to start, otherwise
364
+ * map->refcount is decremented in record__aio_complete() after
365
+ * aio write operation finishes successfully.
366
+ */
367
+ perf_mmap__put(&map->core);
368
+ }
369
+
370
+ return ret;
371
+}
372
+
373
+static off_t record__aio_get_pos(int trace_fd)
374
+{
375
+ return lseek(trace_fd, 0, SEEK_CUR);
376
+}
377
+
378
+static void record__aio_set_pos(int trace_fd, off_t pos)
379
+{
380
+ lseek(trace_fd, pos, SEEK_SET);
381
+}
382
+
383
+static void record__aio_mmap_read_sync(struct record *rec)
384
+{
385
+ int i;
386
+ struct evlist *evlist = rec->evlist;
387
+ struct mmap *maps = evlist->mmap;
388
+
389
+ if (!record__aio_enabled(rec))
390
+ return;
391
+
392
+ for (i = 0; i < evlist->core.nr_mmaps; i++) {
393
+ struct mmap *map = &maps[i];
394
+
395
+ if (map->core.base)
396
+ record__aio_sync(map, true);
397
+ }
398
+}
399
+
400
+static int nr_cblocks_default = 1;
401
+static int nr_cblocks_max = 4;
402
+
403
+static int record__aio_parse(const struct option *opt,
404
+ const char *str,
405
+ int unset)
406
+{
407
+ struct record_opts *opts = (struct record_opts *)opt->value;
408
+
409
+ if (unset) {
410
+ opts->nr_cblocks = 0;
411
+ } else {
412
+ if (str)
413
+ opts->nr_cblocks = strtol(str, NULL, 0);
414
+ if (!opts->nr_cblocks)
415
+ opts->nr_cblocks = nr_cblocks_default;
416
+ }
417
+
418
+ return 0;
419
+}
420
+#else /* HAVE_AIO_SUPPORT */
421
+static int nr_cblocks_max = 0;
422
+
423
+static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
424
+ off_t *off __maybe_unused)
425
+{
426
+ return -1;
427
+}
428
+
429
+static off_t record__aio_get_pos(int trace_fd __maybe_unused)
430
+{
431
+ return -1;
432
+}
433
+
434
+static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
435
+{
436
+}
437
+
438
+static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
439
+{
440
+}
441
+#endif
442
+
443
+static int record__aio_enabled(struct record *rec)
444
+{
445
+ return rec->opts.nr_cblocks > 0;
446
+}
447
+
448
+#define MMAP_FLUSH_DEFAULT 1
449
+static int record__mmap_flush_parse(const struct option *opt,
450
+ const char *str,
451
+ int unset)
452
+{
453
+ int flush_max;
454
+ struct record_opts *opts = (struct record_opts *)opt->value;
455
+ static struct parse_tag tags[] = {
456
+ { .tag = 'B', .mult = 1 },
457
+ { .tag = 'K', .mult = 1 << 10 },
458
+ { .tag = 'M', .mult = 1 << 20 },
459
+ { .tag = 'G', .mult = 1 << 30 },
460
+ { .tag = 0 },
461
+ };
462
+
463
+ if (unset)
464
+ return 0;
465
+
466
+ if (str) {
467
+ opts->mmap_flush = parse_tag_value(str, tags);
468
+ if (opts->mmap_flush == (int)-1)
469
+ opts->mmap_flush = strtol(str, NULL, 0);
470
+ }
471
+
472
+ if (!opts->mmap_flush)
473
+ opts->mmap_flush = MMAP_FLUSH_DEFAULT;
474
+
475
+ flush_max = evlist__mmap_size(opts->mmap_pages);
476
+ flush_max /= 4;
477
+ if (opts->mmap_flush > flush_max)
478
+ opts->mmap_flush = flush_max;
479
+
480
+ return 0;
481
+}
482
+
483
+#ifdef HAVE_ZSTD_SUPPORT
484
+static unsigned int comp_level_default = 1;
485
+
486
+static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
487
+{
488
+ struct record_opts *opts = opt->value;
489
+
490
+ if (unset) {
491
+ opts->comp_level = 0;
492
+ } else {
493
+ if (str)
494
+ opts->comp_level = strtol(str, NULL, 0);
495
+ if (!opts->comp_level)
496
+ opts->comp_level = comp_level_default;
497
+ }
498
+
499
+ return 0;
500
+}
501
+#endif
502
+static unsigned int comp_level_max = 22;
503
+
504
+static int record__comp_enabled(struct record *rec)
505
+{
506
+ return rec->opts.comp_level > 0;
122507 }
123508
124509 static int process_synthesized_event(struct perf_tool *tool,
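/*
 * [Editor's aside - illustrative sketch, not part of the patch above.]
 * The new record__aio_write()/record__aio_complete() helpers follow the
 * standard POSIX AIO pattern: queue the buffer with aio_write(), retrying
 * while the queue is full (EAGAIN), then later poll aio_error()/aio_return()
 * and re-queue the remainder if the kernel completed only part of the write.
 * A minimal standalone version of that pattern is sketched below; the helper
 * names are hypothetical and error handling is trimmed (link with -lrt on
 * older glibc).
 */
#include <aio.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>

static int queue_write(struct aiocb *cb, int fd, void *buf, size_t len, off_t off)
{
	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;
	cb->aio_buf = buf;
	cb->aio_nbytes = len;
	cb->aio_offset = off;
	cb->aio_sigevent.sigev_notify = SIGEV_NONE;	/* no completion signal */

	while (aio_write(cb) < 0) {
		if (errno != EAGAIN)			/* EAGAIN: queue full, retry */
			return -1;
	}
	return 0;
}

/* Returns 1 once the buffer is fully written, 0 while still in flight. */
static int check_complete(struct aiocb *cb)
{
	ssize_t written;

	if (aio_error(cb) == EINPROGRESS)
		return 0;

	written = aio_return(cb);			/* valid exactly once */
	if (written < 0)
		written = 0;
	if ((size_t)written == cb->aio_nbytes)
		return 1;

	/* Partial write: restart with the remainder at the adjusted offset. */
	queue_write(cb, cb->aio_fildes, (char *)cb->aio_buf + written,
		    cb->aio_nbytes - written, cb->aio_offset + written);
	return 0;
}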
....@@ -127,20 +512,41 @@
127512 struct machine *machine __maybe_unused)
128513 {
129514 struct record *rec = container_of(tool, struct record, tool);
130
- return record__write(rec, event, event->header.size);
515
+ return record__write(rec, NULL, event, event->header.size);
131516 }
132517
133
-static int record__pushfn(void *to, void *bf, size_t size)
518
+static int process_locked_synthesized_event(struct perf_tool *tool,
519
+ union perf_event *event,
520
+ struct perf_sample *sample __maybe_unused,
521
+ struct machine *machine __maybe_unused)
522
+{
523
+ static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
524
+ int ret;
525
+
526
+ pthread_mutex_lock(&synth_lock);
527
+ ret = process_synthesized_event(tool, event, sample, machine);
528
+ pthread_mutex_unlock(&synth_lock);
529
+ return ret;
530
+}
531
+
532
+static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
134533 {
135534 struct record *rec = to;
136535
536
+ if (record__comp_enabled(rec)) {
537
+ size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
538
+ bf = map->data;
539
+ }
540
+
137541 rec->samples++;
138
- return record__write(rec, bf, size);
542
+ return record__write(rec, map, bf, size);
139543 }
140544
141
-static volatile int done;
142545 static volatile int signr = -1;
143546 static volatile int child_finished;
547
+#ifdef HAVE_EVENTFD_SUPPORT
548
+static int done_fd = -1;
549
+#endif
144550
145551 static void sig_handler(int sig)
146552 {
....@@ -150,6 +556,21 @@
150556 signr = sig;
151557
152558 done = 1;
559
+#ifdef HAVE_EVENTFD_SUPPORT
560
+{
561
+ u64 tmp = 1;
562
+ /*
563
+ * It is possible for this signal handler to run after done is checked
564
+ * in the main loop, but before the perf counter fds are polled. If this
565
+ * happens, the poll() will continue to wait even though done is set,
566
+ * and will only break out if either another signal is received, or the
567
+ * counters are ready for read. To ensure the poll() doesn't sleep when
568
+ * done is set, use an eventfd (done_fd) to wake up the poll().
569
+ */
570
+ if (write(done_fd, &tmp, sizeof(tmp)) < 0)
571
+ pr_err("failed to signal wakeup fd, error: %m\n");
572
+}
573
+#endif // HAVE_EVENTFD_SUPPORT
153574 }
154575
155576 static void sigsegv_handler(int sig)
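/*
 * [Editor's aside - illustrative sketch, not part of the patch above.]
 * The eventfd write in sig_handler() closes a race: if the signal lands
 * after the main loop has checked "done" but before it re-enters poll(),
 * the wakeup would otherwise be lost and poll() could sleep indefinitely.
 * Keeping an eventfd in the polled fd set and writing to it from the
 * handler makes the next poll() return immediately. Minimal standalone
 * version of the idea (hypothetical names, error handling trimmed):
 */
#include <sys/eventfd.h>
#include <poll.h>
#include <signal.h>
#include <stdint.h>
#include <unistd.h>

static volatile sig_atomic_t quit;
static int wake_fd = -1;

static void on_sigint(int sig)
{
	uint64_t one = 1;

	(void)sig;
	quit = 1;
	/* write() is async-signal-safe; it marks wake_fd readable. */
	if (write(wake_fd, &one, sizeof(one)) < 0) {
		/* nothing safe to do from a signal handler */
	}
}

int main(void)
{
	struct pollfd pfd;

	wake_fd = eventfd(0, EFD_NONBLOCK);
	signal(SIGINT, on_sigint);

	pfd.fd = wake_fd;
	pfd.events = POLLIN;

	while (!quit) {
		/*
		 * Without wake_fd, a SIGINT delivered right here (between the
		 * quit check and poll()) would leave poll() blocked forever.
		 */
		poll(&pfd, 1, -1);
	}
	return 0;
}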
....@@ -170,6 +591,7 @@
170591 #ifdef HAVE_AUXTRACE_SUPPORT
171592
172593 static int record__process_auxtrace(struct perf_tool *tool,
594
+ struct mmap *map,
173595 union perf_event *event, void *data1,
174596 size_t len1, void *data2, size_t len2)
175597 {
....@@ -178,7 +600,7 @@
178600 size_t padding;
179601 u8 pad[8] = {0};
180602
181
- if (!perf_data__is_pipe(data)) {
603
+ if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
182604 off_t file_offset;
183605 int fd = perf_data__fd(data);
184606 int err;
....@@ -197,21 +619,21 @@
197619 if (padding)
198620 padding = 8 - padding;
199621
200
- record__write(rec, event, event->header.size);
201
- record__write(rec, data1, len1);
622
+ record__write(rec, map, event, event->header.size);
623
+ record__write(rec, map, data1, len1);
202624 if (len2)
203
- record__write(rec, data2, len2);
204
- record__write(rec, &pad, padding);
625
+ record__write(rec, map, data2, len2);
626
+ record__write(rec, map, &pad, padding);
205627
206628 return 0;
207629 }
208630
209631 static int record__auxtrace_mmap_read(struct record *rec,
210
- struct auxtrace_mmap *mm)
632
+ struct mmap *map)
211633 {
212634 int ret;
213635
214
- ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
636
+ ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
215637 record__process_auxtrace);
216638 if (ret < 0)
217639 return ret;
....@@ -223,11 +645,11 @@
223645 }
224646
225647 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
226
- struct auxtrace_mmap *mm)
648
+ struct mmap *map)
227649 {
228650 int ret;
229651
230
- ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
652
+ ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
231653 record__process_auxtrace,
232654 rec->opts.auxtrace_snapshot_size);
233655 if (ret < 0)
....@@ -244,14 +666,13 @@
244666 int i;
245667 int rc = 0;
246668
247
- for (i = 0; i < rec->evlist->nr_mmaps; i++) {
248
- struct auxtrace_mmap *mm =
249
- &rec->evlist->mmap[i].auxtrace_mmap;
669
+ for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
670
+ struct mmap *map = &rec->evlist->mmap[i];
250671
251
- if (!mm->base)
672
+ if (!map->auxtrace_mmap.base)
252673 continue;
253674
254
- if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
675
+ if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
255676 rc = -1;
256677 goto out;
257678 }
....@@ -260,17 +681,33 @@
260681 return rc;
261682 }
262683
263
-static void record__read_auxtrace_snapshot(struct record *rec)
684
+static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
264685 {
265686 pr_debug("Recording AUX area tracing snapshot\n");
266687 if (record__auxtrace_read_snapshot_all(rec) < 0) {
267688 trigger_error(&auxtrace_snapshot_trigger);
268689 } else {
269
- if (auxtrace_record__snapshot_finish(rec->itr))
690
+ if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
270691 trigger_error(&auxtrace_snapshot_trigger);
271692 else
272693 trigger_ready(&auxtrace_snapshot_trigger);
273694 }
695
+}
696
+
697
+static int record__auxtrace_snapshot_exit(struct record *rec)
698
+{
699
+ if (trigger_is_error(&auxtrace_snapshot_trigger))
700
+ return 0;
701
+
702
+ if (!auxtrace_record__snapshot_started &&
703
+ auxtrace_record__snapshot_start(rec->itr))
704
+ return -1;
705
+
706
+ record__read_auxtrace_snapshot(rec, true);
707
+ if (trigger_is_error(&auxtrace_snapshot_trigger))
708
+ return -1;
709
+
710
+ return 0;
274711 }
275712
276713 static int record__auxtrace_init(struct record *rec)
....@@ -288,6 +725,11 @@
288725 if (err)
289726 return err;
290727
728
+ err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
729
+ rec->opts.auxtrace_sample_opts);
730
+ if (err)
731
+ return err;
732
+
291733 return auxtrace_parse_filters(rec->evlist);
292734 }
293735
....@@ -295,18 +737,25 @@
295737
296738 static inline
297739 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
298
- struct auxtrace_mmap *mm __maybe_unused)
740
+ struct mmap *map __maybe_unused)
299741 {
300742 return 0;
301743 }
302744
303745 static inline
304
-void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
746
+void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
747
+ bool on_exit __maybe_unused)
305748 {
306749 }
307750
308751 static inline
309752 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
753
+{
754
+ return 0;
755
+}
756
+
757
+static inline
758
+int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
310759 {
311760 return 0;
312761 }
....@@ -318,15 +767,90 @@
318767
319768 #endif
320769
770
+static int record__config_text_poke(struct evlist *evlist)
771
+{
772
+ struct evsel *evsel;
773
+ int err;
774
+
775
+ /* Nothing to do if text poke is already configured */
776
+ evlist__for_each_entry(evlist, evsel) {
777
+ if (evsel->core.attr.text_poke)
778
+ return 0;
779
+ }
780
+
781
+ err = parse_events(evlist, "dummy:u", NULL);
782
+ if (err)
783
+ return err;
784
+
785
+ evsel = evlist__last(evlist);
786
+
787
+ evsel->core.attr.freq = 0;
788
+ evsel->core.attr.sample_period = 1;
789
+ evsel->core.attr.text_poke = 1;
790
+ evsel->core.attr.ksymbol = 1;
791
+
792
+ evsel->core.system_wide = true;
793
+ evsel->no_aux_samples = true;
794
+ evsel->immediate = true;
795
+
796
+ /* Text poke must be collected on all CPUs */
797
+ perf_cpu_map__put(evsel->core.own_cpus);
798
+ evsel->core.own_cpus = perf_cpu_map__new(NULL);
799
+ perf_cpu_map__put(evsel->core.cpus);
800
+ evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
801
+
802
+ evsel__set_sample_bit(evsel, TIME);
803
+
804
+ return 0;
805
+}
806
+
807
+static bool record__kcore_readable(struct machine *machine)
808
+{
809
+ char kcore[PATH_MAX];
810
+ int fd;
811
+
812
+ scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
813
+
814
+ fd = open(kcore, O_RDONLY);
815
+ if (fd < 0)
816
+ return false;
817
+
818
+ close(fd);
819
+
820
+ return true;
821
+}
822
+
823
+static int record__kcore_copy(struct machine *machine, struct perf_data *data)
824
+{
825
+ char from_dir[PATH_MAX];
826
+ char kcore_dir[PATH_MAX];
827
+ int ret;
828
+
829
+ snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
830
+
831
+ ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
832
+ if (ret)
833
+ return ret;
834
+
835
+ return kcore_copy(from_dir, kcore_dir);
836
+}
837
+
321838 static int record__mmap_evlist(struct record *rec,
322
- struct perf_evlist *evlist)
839
+ struct evlist *evlist)
323840 {
324841 struct record_opts *opts = &rec->opts;
842
+ bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
843
+ opts->auxtrace_sample_mode;
325844 char msg[512];
326845
327
- if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
846
+ if (opts->affinity != PERF_AFFINITY_SYS)
847
+ cpu__setup_cpunode_map();
848
+
849
+ if (evlist__mmap_ex(evlist, opts->mmap_pages,
328850 opts->auxtrace_mmap_pages,
329
- opts->auxtrace_snapshot_mode) < 0) {
851
+ auxtrace_overwrite,
852
+ opts->nr_cblocks, opts->affinity,
853
+ opts->mmap_flush, opts->comp_level) < 0) {
330854 if (errno == EPERM) {
331855 pr_err("Permission error mapping pages.\n"
332856 "Consider increasing "
....@@ -355,43 +879,55 @@
355879 static int record__open(struct record *rec)
356880 {
357881 char msg[BUFSIZ];
358
- struct perf_evsel *pos;
359
- struct perf_evlist *evlist = rec->evlist;
882
+ struct evsel *pos;
883
+ struct evlist *evlist = rec->evlist;
360884 struct perf_session *session = rec->session;
361885 struct record_opts *opts = &rec->opts;
362
- struct perf_evsel_config_term *err_term;
363886 int rc = 0;
364887
365888 /*
366
- * For initial_delay we need to add a dummy event so that we can track
367
- * PERF_RECORD_MMAP while we wait for the initial delay to enable the
368
- * real events, the ones asked by the user.
889
+ * For initial_delay or system wide, we need to add a dummy event so
890
+ * that we can track PERF_RECORD_MMAP to cover the delay of waiting or
891
+ * event synthesis.
369892 */
370
- if (opts->initial_delay) {
371
- if (perf_evlist__add_dummy(evlist))
372
- return -ENOMEM;
893
+ if (opts->initial_delay || target__has_cpu(&opts->target)) {
894
+ pos = perf_evlist__get_tracking_event(evlist);
895
+ if (!evsel__is_dummy_event(pos)) {
896
+ /* Set up dummy event. */
897
+ if (evlist__add_dummy(evlist))
898
+ return -ENOMEM;
899
+ pos = evlist__last(evlist);
900
+ perf_evlist__set_tracking_event(evlist, pos);
901
+ }
373902
374
- pos = perf_evlist__first(evlist);
375
- pos->tracking = 0;
376
- pos = perf_evlist__last(evlist);
377
- pos->tracking = 1;
378
- pos->attr.enable_on_exec = 1;
903
+ /*
904
+ * Enable the dummy event when the process is forked for
905
+ * initial_delay, immediately for system wide.
906
+ */
907
+ if (opts->initial_delay && !pos->immediate)
908
+ pos->core.attr.enable_on_exec = 1;
909
+ else
910
+ pos->immediate = 1;
379911 }
380912
381913 perf_evlist__config(evlist, opts, &callchain_param);
382914
383915 evlist__for_each_entry(evlist, pos) {
384916 try_again:
385
- if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
386
- if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
917
+ if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
918
+ if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
387919 if (verbose > 0)
388920 ui__warning("%s\n", msg);
389921 goto try_again;
390922 }
391
-
923
+ if ((errno == EINVAL || errno == EBADF) &&
924
+ pos->leader != pos &&
925
+ pos->weak_group) {
926
+ pos = perf_evlist__reset_weak_group(evlist, pos, true);
927
+ goto try_again;
928
+ }
392929 rc = -errno;
393
- perf_evsel__open_strerror(pos, &opts->target,
394
- errno, msg, sizeof(msg));
930
+ evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
395931 ui__error("%s\n", msg);
396932 goto out;
397933 }
....@@ -399,18 +935,21 @@
399935 pos->supported = true;
400936 }
401937
402
- if (perf_evlist__apply_filters(evlist, &pos)) {
403
- pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
404
- pos->filter, perf_evsel__name(pos), errno,
405
- str_error_r(errno, msg, sizeof(msg)));
406
- rc = -1;
407
- goto out;
938
+ if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
939
+ pr_warning(
940
+"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
941
+"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
942
+"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
943
+"file is not found in the buildid cache or in the vmlinux path.\n\n"
944
+"Samples in kernel modules won't be resolved at all.\n\n"
945
+"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
946
+"even with a suitable vmlinux or kallsyms file.\n\n");
408947 }
409948
410
- if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
411
- pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
412
- err_term->val.drv_cfg, perf_evsel__name(pos), errno,
413
- str_error_r(errno, msg, sizeof(msg)));
949
+ if (perf_evlist__apply_filters(evlist, &pos)) {
950
+ pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
951
+ pos->filter, evsel__name(pos), errno,
952
+ str_error_r(errno, msg, sizeof(msg)));
414953 rc = -1;
415954 goto out;
416955 }
....@@ -428,7 +967,7 @@
428967 static int process_sample_event(struct perf_tool *tool,
429968 union perf_event *event,
430969 struct perf_sample *sample,
431
- struct perf_evsel *evsel,
970
+ struct evsel *evsel,
432971 struct machine *machine)
433972 {
434973 struct record *rec = container_of(tool, struct record, tool);
....@@ -447,10 +986,9 @@
447986
448987 static int process_buildids(struct record *rec)
449988 {
450
- struct perf_data *data = &rec->data;
451989 struct perf_session *session = rec->session;
452990
453
- if (data->size == 0)
991
+ if (perf_data__size(&rec->data) == 0)
454992 return 0;
455993
456994 /*
....@@ -510,13 +1048,61 @@
5101048 .type = PERF_RECORD_FINISHED_ROUND,
5111049 };
5121050
513
-static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
514
- bool overwrite)
1051
+static void record__adjust_affinity(struct record *rec, struct mmap *map)
1052
+{
1053
+ if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1054
+ !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1055
+ rec->affinity_mask.nbits)) {
1056
+ bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1057
+ bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1058
+ map->affinity_mask.bits, rec->affinity_mask.nbits);
1059
+ sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1060
+ (cpu_set_t *)rec->affinity_mask.bits);
1061
+ if (verbose == 2)
1062
+ mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1063
+ }
1064
+}
1065
+
1066
+static size_t process_comp_header(void *record, size_t increment)
1067
+{
1068
+ struct perf_record_compressed *event = record;
1069
+ size_t size = sizeof(*event);
1070
+
1071
+ if (increment) {
1072
+ event->header.size += increment;
1073
+ return increment;
1074
+ }
1075
+
1076
+ event->header.type = PERF_RECORD_COMPRESSED;
1077
+ event->header.size = size;
1078
+
1079
+ return size;
1080
+}
1081
+
1082
+static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1083
+ void *src, size_t src_size)
1084
+{
1085
+ size_t compressed;
1086
+ size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1087
+
1088
+ compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1089
+ max_record_size, process_comp_header);
1090
+
1091
+ session->bytes_transferred += src_size;
1092
+ session->bytes_compressed += compressed;
1093
+
1094
+ return compressed;
1095
+}
1096
+
1097
+static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1098
+ bool overwrite, bool synch)
5151099 {
5161100 u64 bytes_written = rec->bytes_written;
5171101 int i;
5181102 int rc = 0;
519
- struct perf_mmap *maps;
1103
+ struct mmap *maps;
1104
+ int trace_fd = rec->data.file.fd;
1105
+ off_t off = 0;
5201106
5211107 if (!evlist)
5221108 return 0;
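/*
 * [Editor's aside - illustrative sketch, not part of the patch above.]
 * record__adjust_affinity() migrates the recording thread onto the CPU set
 * associated with the mmap it is about to drain (the new "node"/"cpu"
 * affinity modes), the idea being to keep the buffer copies local to the
 * memory being read. The underlying mechanism is plain sched_setaffinity();
 * a minimal standalone use, pinning the calling thread to one CPU, looks
 * like this (pin_to_cpu is a hypothetical helper):
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	/* pid 0 means the calling thread; it is migrated on the next schedule. */
	if (sched_setaffinity(0, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}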
....@@ -528,29 +1114,56 @@
5281114 if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
5291115 return 0;
5301116
531
- for (i = 0; i < evlist->nr_mmaps; i++) {
532
- struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;
1117
+ if (record__aio_enabled(rec))
1118
+ off = record__aio_get_pos(trace_fd);
5331119
534
- if (maps[i].base) {
535
- if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) {
536
- rc = -1;
537
- goto out;
1120
+ for (i = 0; i < evlist->core.nr_mmaps; i++) {
1121
+ u64 flush = 0;
1122
+ struct mmap *map = &maps[i];
1123
+
1124
+ if (map->core.base) {
1125
+ record__adjust_affinity(rec, map);
1126
+ if (synch) {
1127
+ flush = map->core.flush;
1128
+ map->core.flush = 1;
5381129 }
1130
+ if (!record__aio_enabled(rec)) {
1131
+ if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1132
+ if (synch)
1133
+ map->core.flush = flush;
1134
+ rc = -1;
1135
+ goto out;
1136
+ }
1137
+ } else {
1138
+ if (record__aio_push(rec, map, &off) < 0) {
1139
+ record__aio_set_pos(trace_fd, off);
1140
+ if (synch)
1141
+ map->core.flush = flush;
1142
+ rc = -1;
1143
+ goto out;
1144
+ }
1145
+ }
1146
+ if (synch)
1147
+ map->core.flush = flush;
5391148 }
5401149
541
- if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
542
- record__auxtrace_mmap_read(rec, mm) != 0) {
1150
+ if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1151
+ !rec->opts.auxtrace_sample_mode &&
1152
+ record__auxtrace_mmap_read(rec, map) != 0) {
5431153 rc = -1;
5441154 goto out;
5451155 }
5461156 }
1157
+
1158
+ if (record__aio_enabled(rec))
1159
+ record__aio_set_pos(trace_fd, off);
5471160
5481161 /*
5491162 * Mark the round finished in case we wrote
5501163 * at least one event.
5511164 */
5521165 if (bytes_written != rec->bytes_written)
553
- rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
1166
+ rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
5541167
5551168 if (overwrite)
5561169 perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
....@@ -558,15 +1171,15 @@
5581171 return rc;
5591172 }
5601173
561
-static int record__mmap_read_all(struct record *rec)
1174
+static int record__mmap_read_all(struct record *rec, bool synch)
5621175 {
5631176 int err;
5641177
565
- err = record__mmap_read_evlist(rec, rec->evlist, false);
1178
+ err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
5661179 if (err)
5671180 return err;
5681181
569
- return record__mmap_read_evlist(rec, rec->evlist, true);
1182
+ return record__mmap_read_evlist(rec, rec->evlist, true, synch);
5701183 }
5711184
5721185 static void record__init_features(struct record *rec)
....@@ -580,7 +1193,7 @@
5801193 if (rec->no_buildid)
5811194 perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
5821195
583
- if (!have_tracepoints(&rec->evlist->entries))
1196
+ if (!have_tracepoints(&rec->evlist->core.entries))
5841197 perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
5851198
5861199 if (!rec->opts.branch_stack)
....@@ -588,6 +1201,16 @@
5881201
5891202 if (!rec->opts.full_auxtrace)
5901203 perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1204
+
1205
+ if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1206
+ perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1207
+
1208
+ if (!rec->opts.use_clockid)
1209
+ perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1210
+
1211
+ perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1212
+ if (!record__comp_enabled(rec))
1213
+ perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
5911214
5921215 perf_header__clear_feat(&session->header, HEADER_STAT);
5931216 }
....@@ -602,7 +1225,7 @@
6021225 return;
6031226
6041227 rec->session->header.data_size += rec->bytes_written;
605
- data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1228
+ data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
6061229
6071230 if (!rec->no_buildid) {
6081231 process_buildids(rec);
....@@ -618,7 +1241,7 @@
6181241 static int record__synthesize_workload(struct record *rec, bool tail)
6191242 {
6201243 int err;
621
- struct thread_map *thread_map;
1244
+ struct perf_thread_map *thread_map;
6221245
6231246 if (rec->opts.tail_synthesize != tail)
6241247 return 0;
....@@ -630,9 +1253,8 @@
6301253 err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
6311254 process_synthesized_event,
6321255 &rec->session->machines.host,
633
- rec->opts.sample_address,
634
- rec->opts.proc_map_timeout);
635
- thread_map__put(thread_map);
1256
+ rec->opts.sample_address);
1257
+ perf_thread_map__put(thread_map);
6361258 return err;
6371259 }
6381260
....@@ -643,9 +1265,12 @@
6431265 {
6441266 struct perf_data *data = &rec->data;
6451267 int fd, err;
1268
+ char *new_filename;
6461269
6471270 /* Same Size: "2015122520103046"*/
6481271 char timestamp[] = "InvalidTimestamp";
1272
+
1273
+ record__aio_mmap_read_sync(rec);
6491274
6501275 record__synthesize(rec, true);
6511276 if (target__none(&rec->opts.target))
....@@ -661,7 +1286,7 @@
6611286
6621287 fd = perf_data__switch(data, timestamp,
6631288 rec->session->header.data_offset,
664
- at_exit);
1289
+ at_exit, &new_filename);
6651290 if (fd >= 0 && !at_exit) {
6661291 rec->bytes_written = 0;
6671292 rec->session->header.data_size = 0;
....@@ -669,7 +1294,22 @@
6691294
6701295 if (!quiet)
6711296 fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
672
- data->file.path, timestamp);
1297
+ data->path, timestamp);
1298
+
1299
+ if (rec->switch_output.num_files) {
1300
+ int n = rec->switch_output.cur_file + 1;
1301
+
1302
+ if (n >= rec->switch_output.num_files)
1303
+ n = 0;
1304
+ rec->switch_output.cur_file = n;
1305
+ if (rec->switch_output.filenames[n]) {
1306
+ remove(rec->switch_output.filenames[n]);
1307
+ zfree(&rec->switch_output.filenames[n]);
1308
+ }
1309
+ rec->switch_output.filenames[n] = new_filename;
1310
+ } else {
1311
+ free(new_filename);
1312
+ }
6731313
6741314 /* Output tracking events */
6751315 if (!at_exit) {
....@@ -709,23 +1349,14 @@
7091349 static void snapshot_sig_handler(int sig);
7101350 static void alarm_sig_handler(int sig);
7111351
712
-int __weak
713
-perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
714
- struct perf_tool *tool __maybe_unused,
715
- perf_event__handler_t process __maybe_unused,
716
- struct machine *machine __maybe_unused)
717
-{
718
- return 0;
719
-}
720
-
7211352 static const struct perf_event_mmap_page *
722
-perf_evlist__pick_pc(struct perf_evlist *evlist)
1353
+perf_evlist__pick_pc(struct evlist *evlist)
7231354 {
7241355 if (evlist) {
725
- if (evlist->mmap && evlist->mmap[0].base)
726
- return evlist->mmap[0].base;
727
- if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
728
- return evlist->overwrite_mmap[0].base;
1356
+ if (evlist->mmap && evlist->mmap[0].core.base)
1357
+ return evlist->mmap[0].core.base;
1358
+ if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1359
+ return evlist->overwrite_mmap[0].core.base;
7291360 }
7301361 return NULL;
7311362 }
....@@ -749,6 +1380,7 @@
7491380 struct perf_tool *tool = &rec->tool;
7501381 int fd = perf_data__fd(data);
7511382 int err = 0;
1383
+ event_op f = process_synthesized_event;
7521384
7531385 if (rec->opts.tail_synthesize != tail)
7541386 return 0;
....@@ -758,7 +1390,7 @@
7581390 * We need to synthesize events first, because some
7591391 * features works on top of them (on report side).
7601392 */
761
- err = perf_event__synthesize_attrs(tool, session,
1393
+ err = perf_event__synthesize_attrs(tool, rec->evlist,
7621394 process_synthesized_event);
7631395 if (err < 0) {
7641396 pr_err("Couldn't synthesize attrs.\n");
....@@ -772,7 +1404,7 @@
7721404 return err;
7731405 }
7741406
775
- if (have_tracepoints(&rec->evlist->entries)) {
1407
+ if (have_tracepoints(&rec->evlist->core.entries)) {
7761408 /*
7771409 * FIXME err <= 0 here actually means that
7781410 * there were no tracepoints so its not really
....@@ -795,6 +1427,15 @@
7951427 process_synthesized_event, machine);
7961428 if (err)
7971429 goto out;
1430
+
1431
+ /* Synthesize id_index before auxtrace_info */
1432
+ if (rec->opts.auxtrace_sample_mode) {
1433
+ err = perf_event__synthesize_id_index(tool,
1434
+ process_synthesized_event,
1435
+ session->evlist, machine);
1436
+ if (err)
1437
+ goto out;
1438
+ }
7981439
7991440 if (rec->opts.full_auxtrace) {
8001441 err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
....@@ -829,7 +1470,7 @@
8291470 if (err)
8301471 goto out;
8311472
832
- err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
1473
+ err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
8331474 process_synthesized_event,
8341475 NULL);
8351476 if (err < 0) {
....@@ -837,18 +1478,129 @@
8371478 return err;
8381479 }
8391480
840
- err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
1481
+ err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
8411482 process_synthesized_event, NULL);
8421483 if (err < 0) {
8431484 pr_err("Couldn't synthesize cpu map.\n");
8441485 return err;
8451486 }
8461487
847
- err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
848
- process_synthesized_event, opts->sample_address,
849
- opts->proc_map_timeout, 1);
1488
+ err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1489
+ machine, opts);
1490
+ if (err < 0)
1491
+ pr_warning("Couldn't synthesize bpf events.\n");
1492
+
1493
+ err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1494
+ machine);
1495
+ if (err < 0)
1496
+ pr_warning("Couldn't synthesize cgroup events.\n");
1497
+
1498
+ if (rec->opts.nr_threads_synthesize > 1) {
1499
+ perf_set_multithreaded();
1500
+ f = process_locked_synthesized_event;
1501
+ }
1502
+
1503
+ err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1504
+ f, opts->sample_address,
1505
+ rec->opts.nr_threads_synthesize);
1506
+
1507
+ if (rec->opts.nr_threads_synthesize > 1)
1508
+ perf_set_singlethreaded();
1509
+
8501510 out:
8511511 return err;
1512
+}
1513
+
1514
+static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1515
+{
1516
+ struct record *rec = data;
1517
+ pthread_kill(rec->thread_id, SIGUSR2);
1518
+ return 0;
1519
+}
1520
+
1521
+static int record__setup_sb_evlist(struct record *rec)
1522
+{
1523
+ struct record_opts *opts = &rec->opts;
1524
+
1525
+ if (rec->sb_evlist != NULL) {
1526
+ /*
1527
+ * We get here if --switch-output-event populated the
1528
+ * sb_evlist, so associate a callback that will send a SIGUSR2
1529
+ * to the main thread.
1530
+ */
1531
+ evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1532
+ rec->thread_id = pthread_self();
1533
+ }
1534
+#ifdef HAVE_LIBBPF_SUPPORT
1535
+ if (!opts->no_bpf_event) {
1536
+ if (rec->sb_evlist == NULL) {
1537
+ rec->sb_evlist = evlist__new();
1538
+
1539
+ if (rec->sb_evlist == NULL) {
1540
+ pr_err("Couldn't create side band evlist.\n.");
1541
+ return -1;
1542
+ }
1543
+ }
1544
+
1545
+ if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1546
+ pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1547
+ return -1;
1548
+ }
1549
+ }
1550
+#endif
1551
+ if (perf_evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1552
+ pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1553
+ opts->no_bpf_event = true;
1554
+ }
1555
+
1556
+ return 0;
1557
+}
1558
+
1559
+static int record__init_clock(struct record *rec)
1560
+{
1561
+ struct perf_session *session = rec->session;
1562
+ struct timespec ref_clockid;
1563
+ struct timeval ref_tod;
1564
+ u64 ref;
1565
+
1566
+ if (!rec->opts.use_clockid)
1567
+ return 0;
1568
+
1569
+ if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1570
+ session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1571
+
1572
+ session->header.env.clock.clockid = rec->opts.clockid;
1573
+
1574
+ if (gettimeofday(&ref_tod, NULL) != 0) {
1575
+ pr_err("gettimeofday failed, cannot set reference time.\n");
1576
+ return -1;
1577
+ }
1578
+
1579
+ if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1580
+ pr_err("clock_gettime failed, cannot set reference time.\n");
1581
+ return -1;
1582
+ }
1583
+
1584
+ ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1585
+ (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1586
+
1587
+ session->header.env.clock.tod_ns = ref;
1588
+
1589
+ ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1590
+ (u64) ref_clockid.tv_nsec;
1591
+
1592
+ session->header.env.clock.clockid_ns = ref;
1593
+ return 0;
1594
+}
1595
+
1596
+static void hit_auxtrace_snapshot_trigger(struct record *rec)
1597
+{
1598
+ if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1599
+ trigger_hit(&auxtrace_snapshot_trigger);
1600
+ auxtrace_record__snapshot_started = 1;
1601
+ if (auxtrace_record__snapshot_start(rec->itr))
1602
+ trigger_error(&auxtrace_snapshot_trigger);
1603
+ }
8521604 }
8531605
8541606 static int __cmd_record(struct record *rec, int argc, const char **argv)
....@@ -863,6 +1615,8 @@
8631615 struct perf_session *session;
8641616 bool disabled = false, draining = false;
8651617 int fd;
1618
+ float ratio = 0;
1619
+ enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
8661620
8671621 atexit(record__sig_exit);
8681622 signal(SIGCHLD, sig_handler);
....@@ -872,6 +1626,15 @@
8721626
8731627 if (rec->opts.record_namespaces)
8741628 tool->namespace_events = true;
1629
+
1630
+ if (rec->opts.record_cgroup) {
1631
+#ifdef HAVE_FILE_HANDLE
1632
+ tool->cgroup_events = true;
1633
+#else
1634
+ pr_err("cgroup tracking is not supported\n");
1635
+ return -1;
1636
+#endif
1637
+ }
8751638
8761639 if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
8771640 signal(SIGUSR2, snapshot_sig_handler);
....@@ -884,13 +1647,44 @@
8841647 }
8851648
8861649 session = perf_session__new(data, false, tool);
887
- if (session == NULL) {
1650
+ if (IS_ERR(session)) {
8881651 pr_err("Perf session creation failed.\n");
889
- return -1;
1652
+ return PTR_ERR(session);
8901653 }
8911654
8921655 fd = perf_data__fd(data);
8931656 rec->session = session;
1657
+
1658
+ if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1659
+ pr_err("Compression initialization failed.\n");
1660
+ return -1;
1661
+ }
1662
+#ifdef HAVE_EVENTFD_SUPPORT
1663
+ done_fd = eventfd(0, EFD_NONBLOCK);
1664
+ if (done_fd < 0) {
1665
+ pr_err("Failed to create wakeup eventfd, error: %m\n");
1666
+ status = -1;
1667
+ goto out_delete_session;
1668
+ }
1669
+ err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
1670
+ if (err < 0) {
1671
+ pr_err("Failed to add wakeup eventfd to poll list\n");
1672
+ status = err;
1673
+ goto out_delete_session;
1674
+ }
1675
+#endif // HAVE_EVENTFD_SUPPORT
1676
+
1677
+ session->header.env.comp_type = PERF_COMP_ZSTD;
1678
+ session->header.env.comp_level = rec->opts.comp_level;
1679
+
1680
+ if (rec->opts.kcore &&
1681
+ !record__kcore_readable(&session->machines.host)) {
1682
+ pr_err("ERROR: kcore is not readable.\n");
1683
+ return -1;
1684
+ }
1685
+
1686
+ if (record__init_clock(rec))
1687
+ return -1;
8941688
8951689 record__init_features(rec);
8961690
....@@ -911,12 +1705,21 @@
9111705 * because we synthesize event name through the pipe
9121706 * and need the id for that.
9131707 */
914
- if (data->is_pipe && rec->evlist->nr_entries == 1)
1708
+ if (data->is_pipe && rec->evlist->core.nr_entries == 1)
9151709 rec->opts.sample_id = true;
9161710
9171711 if (record__open(rec) != 0) {
9181712 err = -1;
9191713 goto out_child;
1714
+ }
1715
+ session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1716
+
1717
+ if (rec->opts.kcore) {
1718
+ err = record__kcore_copy(&session->machines.host, data);
1719
+ if (err) {
1720
+ pr_err("ERROR: Failed to copy kcore\n");
1721
+ goto out_child;
1722
+ }
9201723 }
9211724
9221725 err = bpf__apply_obj_config();
....@@ -933,7 +1736,7 @@
9331736 * Normally perf_session__new would do this, but it doesn't have the
9341737 * evlist.
9351738 */
936
- if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1739
+ if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
9371740 pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
9381741 rec->tool.ordered_events = false;
9391742 }
....@@ -951,13 +1754,17 @@
9511754 goto out_child;
9521755 }
9531756
1757
+ err = -1;
9541758 if (!rec->no_buildid
9551759 && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
9561760 pr_err("Couldn't generate buildids. "
9571761 "Use --no-buildid to profile anyway.\n");
958
- err = -1;
9591762 goto out_child;
9601763 }
1764
+
1765
+ err = record__setup_sb_evlist(rec);
1766
+ if (err)
1767
+ goto out_child;
9611768
9621769 err = record__synthesize(rec, false);
9631770 if (err < 0)
....@@ -980,7 +1787,7 @@
9801787 * so don't spoil it by prematurely enabling them.
9811788 */
9821789 if (!target__none(&opts->target) && !opts->initial_delay)
983
- perf_evlist__enable(rec->evlist);
1790
+ evlist__enable(rec->evlist);
9841791
9851792 /*
9861793 * Let the child rip
....@@ -1031,9 +1838,16 @@
10311838 perf_evlist__start_workload(rec->evlist);
10321839 }
10331840
1841
+ if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1842
+ goto out_child;
1843
+
10341844 if (opts->initial_delay) {
1035
- usleep(opts->initial_delay * USEC_PER_MSEC);
1036
- perf_evlist__enable(rec->evlist);
1845
+ pr_info(EVLIST_DISABLED_MSG);
1846
+ if (opts->initial_delay > 0) {
1847
+ usleep(opts->initial_delay * USEC_PER_MSEC);
1848
+ evlist__enable(rec->evlist);
1849
+ pr_info(EVLIST_ENABLED_MSG);
1850
+ }
10371851 }
10381852
10391853 trigger_ready(&auxtrace_snapshot_trigger);
....@@ -1053,7 +1867,7 @@
10531867 if (trigger_is_hit(&switch_output_trigger) || done || draining)
10541868 perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
10551869
1056
- if (record__mmap_read_all(rec) < 0) {
1870
+ if (record__mmap_read_all(rec, false) < 0) {
10571871 trigger_error(&auxtrace_snapshot_trigger);
10581872 trigger_error(&switch_output_trigger);
10591873 err = -1;
....@@ -1063,7 +1877,7 @@
10631877 if (auxtrace_record__snapshot_started) {
10641878 auxtrace_record__snapshot_started = 0;
10651879 if (!trigger_is_error(&auxtrace_snapshot_trigger))
1066
- record__read_auxtrace_snapshot(rec);
1880
+ record__read_auxtrace_snapshot(rec, false);
10671881 if (trigger_is_error(&auxtrace_snapshot_trigger)) {
10681882 pr_err("AUX area tracing snapshot failed\n");
10691883 err = -1;
....@@ -1112,7 +1926,7 @@
11121926 if (hits == rec->samples) {
11131927 if (done || draining)
11141928 break;
1115
- err = perf_evlist__poll(rec->evlist, -1);
1929
+ err = evlist__poll(rec->evlist, -1);
11161930 /*
11171931 * Propagate error, only if there's any. Ignore positive
11181932 * number of returned events and interrupt error.
....@@ -1121,8 +1935,27 @@
11211935 err = 0;
11221936 waking++;
11231937
1124
- if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1938
+ if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
11251939 draining = true;
1940
+ }
1941
+
1942
+ if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1943
+ switch (cmd) {
1944
+ case EVLIST_CTL_CMD_ENABLE:
1945
+ pr_info(EVLIST_ENABLED_MSG);
1946
+ break;
1947
+ case EVLIST_CTL_CMD_DISABLE:
1948
+ pr_info(EVLIST_DISABLED_MSG);
1949
+ break;
1950
+ case EVLIST_CTL_CMD_SNAPSHOT:
1951
+ hit_auxtrace_snapshot_trigger(rec);
1952
+ evlist__ctlfd_ack(rec->evlist);
1953
+ break;
1954
+ case EVLIST_CTL_CMD_ACK:
1955
+ case EVLIST_CTL_CMD_UNSUPPORTED:
1956
+ default:
1957
+ break;
1958
+ }
11261959 }
11271960
11281961 /*
....@@ -1132,12 +1965,16 @@
11321965 */
11331966 if (done && !disabled && !target__none(&opts->target)) {
11341967 trigger_off(&auxtrace_snapshot_trigger);
1135
- perf_evlist__disable(rec->evlist);
1968
+ evlist__disable(rec->evlist);
11361969 disabled = true;
11371970 }
11381971 }
1972
+
11391973 trigger_off(&auxtrace_snapshot_trigger);
11401974 trigger_off(&switch_output_trigger);
1975
+
1976
+ if (opts->auxtrace_snapshot_on_exit)
1977
+ record__auxtrace_snapshot_exit(rec);
11411978
11421979 if (forks && workload_exec_errno) {
11431980 char msg[STRERR_BUFSIZE];
....@@ -1154,6 +1991,15 @@
11541991 record__synthesize_workload(rec, true);
11551992
11561993 out_child:
1994
+ evlist__finalize_ctlfd(rec->evlist);
1995
+ record__mmap_read_all(rec, true);
1996
+ record__aio_mmap_read_sync(rec);
1997
+
1998
+ if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1999
+ ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2000
+ session->header.env.comp_ratio = ratio + 0.5;
2001
+ }
2002
+
11572003 if (forks) {
11582004 int exit_status;
11592005
....@@ -1200,13 +2046,27 @@
12002046 else
12012047 samples[0] = '\0';
12022048
1203
- fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
2049
+ fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s",
12042050 perf_data__size(data) / 1024.0 / 1024.0,
1205
- data->file.path, postfix, samples);
2051
+ data->path, postfix, samples);
2052
+ if (ratio) {
2053
+ fprintf(stderr, ", compressed (original %.3f MB, ratio is %.3f)",
2054
+ rec->session->bytes_transferred / 1024.0 / 1024.0,
2055
+ ratio);
2056
+ }
2057
+ fprintf(stderr, " ]\n");
12062058 }
12072059
12082060 out_delete_session:
2061
+#ifdef HAVE_EVENTFD_SUPPORT
2062
+ if (done_fd >= 0)
2063
+ close(done_fd);
2064
+#endif
2065
+ zstd_fini(&session->zstd_data);
12092066 perf_session__delete(session);
2067
+
2068
+ if (!opts->no_bpf_event)
2069
+ perf_evlist__stop_sb_thread(rec->sb_evlist);
12102070 return status;
12112071 }
12122072
....@@ -1287,91 +2147,57 @@
12872147 var = "call-graph.record-mode";
12882148 return perf_default_config(var, value, cb);
12892149 }
2150
+#ifdef HAVE_AIO_SUPPORT
2151
+ if (!strcmp(var, "record.aio")) {
2152
+ rec->opts.nr_cblocks = strtol(value, NULL, 0);
2153
+ if (!rec->opts.nr_cblocks)
2154
+ rec->opts.nr_cblocks = nr_cblocks_default;
2155
+ }
2156
+#endif
12902157
12912158 return 0;
12922159 }
12932160
1294
-struct clockid_map {
1295
- const char *name;
1296
- int clockid;
1297
-};
12982161
1299
-#define CLOCKID_MAP(n, c) \
1300
- { .name = n, .clockid = (c), }
1301
-
1302
-#define CLOCKID_END { .name = NULL, }
1303
-
1304
-
1305
-/*
1306
- * Add the missing ones, we need to build on many distros...
1307
- */
1308
-#ifndef CLOCK_MONOTONIC_RAW
1309
-#define CLOCK_MONOTONIC_RAW 4
1310
-#endif
1311
-#ifndef CLOCK_BOOTTIME
1312
-#define CLOCK_BOOTTIME 7
1313
-#endif
1314
-#ifndef CLOCK_TAI
1315
-#define CLOCK_TAI 11
1316
-#endif
1317
-
1318
-static const struct clockid_map clockids[] = {
1319
- /* available for all events, NMI safe */
1320
- CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1321
- CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1322
-
1323
- /* available for some events */
1324
- CLOCKID_MAP("realtime", CLOCK_REALTIME),
1325
- CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1326
- CLOCKID_MAP("tai", CLOCK_TAI),
1327
-
1328
- /* available for the lazy */
1329
- CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1330
- CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1331
- CLOCKID_MAP("real", CLOCK_REALTIME),
1332
- CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1333
-
1334
- CLOCKID_END,
1335
-};
1336
-
1337
-static int parse_clockid(const struct option *opt, const char *str, int unset)
2162
+static int record__parse_affinity(const struct option *opt, const char *str, int unset)
13382163 {
13392164 struct record_opts *opts = (struct record_opts *)opt->value;
1340
- const struct clockid_map *cm;
1341
- const char *ostr = str;
2165
+
2166
+ if (unset || !str)
2167
+ return 0;
2168
+
2169
+ if (!strcasecmp(str, "node"))
2170
+ opts->affinity = PERF_AFFINITY_NODE;
2171
+ else if (!strcasecmp(str, "cpu"))
2172
+ opts->affinity = PERF_AFFINITY_CPU;
2173
+
2174
+ return 0;
2175
+}
2176
+
2177
+static int parse_output_max_size(const struct option *opt,
2178
+ const char *str, int unset)
2179
+{
2180
+ unsigned long *s = (unsigned long *)opt->value;
2181
+ static struct parse_tag tags_size[] = {
2182
+ { .tag = 'B', .mult = 1 },
2183
+ { .tag = 'K', .mult = 1 << 10 },
2184
+ { .tag = 'M', .mult = 1 << 20 },
2185
+ { .tag = 'G', .mult = 1 << 30 },
2186
+ { .tag = 0 },
2187
+ };
2188
+ unsigned long val;
13422189
13432190 if (unset) {
1344
- opts->use_clockid = 0;
2191
+ *s = 0;
13452192 return 0;
13462193 }
13472194
1348
- /* no arg passed */
1349
- if (!str)
2195
+ val = parse_tag_value(str, tags_size);
2196
+ if (val != (unsigned long) -1) {
2197
+ *s = val;
13502198 return 0;
1351
-
1352
- /* no setting it twice */
1353
- if (opts->use_clockid)
1354
- return -1;
1355
-
1356
- opts->use_clockid = true;
1357
-
1358
- /* if its a number, we're done */
1359
- if (sscanf(str, "%d", &opts->clockid) == 1)
1360
- return 0;
1361
-
1362
- /* allow a "CLOCK_" prefix to the name */
1363
- if (!strncasecmp(str, "CLOCK_", 6))
1364
- str += 6;
1365
-
1366
- for (cm = clockids; cm->name; cm++) {
1367
- if (!strcasecmp(str, cm->name)) {
1368
- opts->clockid = cm->clockid;
1369
- return 0;
1370
- }
13712199 }
13722200
1373
- opts->use_clockid = false;
1374
- ui__warning("unknown clockid %s, check man page\n", ostr);
13752201 return -1;
13762202 }
13772203
....@@ -1418,9 +2244,18 @@
14182244 return ret;
14192245 }
14202246
2247
+static int parse_control_option(const struct option *opt,
2248
+ const char *str,
2249
+ int unset __maybe_unused)
2250
+{
2251
+ struct record_opts *opts = opt->value;
2252
+
2253
+ return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2254
+}
2255
+
14212256 static void switch_output_size_warn(struct record *rec)
14222257 {
1423
- u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
2258
+ u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
14242259 struct switch_output *s = &rec->switch_output;
14252260
14262261 wakeup_size /= 2;
....@@ -1454,10 +2289,19 @@
14542289 };
14552290 unsigned long val;
14562291
2292
+ /*
2293
+ * If we're using --switch-output-events, then we imply its
2294
+ * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2295
+ * thread to its parent.
2296
+ */
2297
+ if (rec->switch_output_event_set)
2298
+ goto do_signal;
2299
+
14572300 if (!s->set)
14582301 return 0;
14592302
14602303 if (!strcmp(s->str, "signal")) {
2304
+do_signal:
14612305 s->signal = true;
14622306 pr_debug("switch-output with SIGUSR2 signal\n");
14632307 goto enabled;
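The comment above spells out the mechanism: with --switch-output-events the side-band thread raises SIGUSR2 in its parent, the same trigger --switch-output=signal uses to rotate the output file. A self-contained sketch of that signal-driven rotation pattern, generic POSIX code under assumed names rather than perf's implementation:

/* Sketch: a helper thread signals the process with SIGUSR2; the main loop
 * treats the flag as "switch to a new output file now". Build with -pthread. */
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t rotate;

static void on_sigusr2(int sig)
{
	(void)sig;
	rotate = 1;
}

static void *sideband(void *arg)
{
	(void)arg;
	sleep(1);			/* pretend the switch-output event fired */
	kill(getpid(), SIGUSR2);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	int i;

	signal(SIGUSR2, on_sigusr2);
	pthread_create(&tid, NULL, sideband, NULL);

	for (i = 0; i < 3; i++) {
		sleep(1);
		if (rotate) {
			rotate = 0;
			printf("rotate: would rename the data file here\n");
		}
	}
	pthread_join(tid, NULL);
	return 0;
}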
....@@ -1497,6 +2341,31 @@
14972341 };
14982342 const char * const *record_usage = __record_usage;
14992343
2344
+static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2345
+ struct perf_sample *sample, struct machine *machine)
2346
+{
2347
+ /*
2348
+ * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2349
+ * no need to add them twice.
2350
+ */
2351
+ if (!(event->header.misc & PERF_RECORD_MISC_USER))
2352
+ return 0;
2353
+ return perf_event__process_mmap(tool, event, sample, machine);
2354
+}
2355
+
2356
+static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2357
+ struct perf_sample *sample, struct machine *machine)
2358
+{
2359
+ /*
2360
+ * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
2361
+ * no need to add them twice.
2362
+ */
2363
+ if (!(event->header.misc & PERF_RECORD_MISC_USER))
2364
+ return 0;
2365
+
2366
+ return perf_event__process_mmap2(tool, event, sample, machine);
2367
+}
2368
+
15002369 /*
15012370 * XXX Ideally would be local to cmd_record() and passed to a record__new
15022371 * because we need to have access to it in record__exit, that is called
....@@ -1518,7 +2387,10 @@
15182387 .uses_mmap = true,
15192388 .default_per_cpu = true,
15202389 },
1521
- .proc_map_timeout = 500,
2390
+ .mmap_flush = MMAP_FLUSH_DEFAULT,
2391
+ .nr_threads_synthesize = 1,
2392
+ .ctl_fd = -1,
2393
+ .ctl_fd_ack = -1,
15222394 },
15232395 .tool = {
15242396 .sample = process_sample_event,
....@@ -1526,8 +2398,8 @@
15262398 .exit = perf_event__process_exit,
15272399 .comm = perf_event__process_comm,
15282400 .namespaces = perf_event__process_namespaces,
1529
- .mmap = perf_event__process_mmap,
1530
- .mmap2 = perf_event__process_mmap2,
2401
+ .mmap = build_id__process_mmap,
2402
+ .mmap2 = build_id__process_mmap2,
15312403 .ordered_events = true,
15322404 },
15332405 };
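The initializer above routes each record type to its handler through struct perf_tool's table of function pointers, with build_id__process_mmap()/mmap2() now interposed in front of the stock mmap handlers to skip kernel maps. A tiny sketch of that callback-table dispatch pattern, using hypothetical names rather than perf's types:

/* Sketch: per-event-type handlers selected through a struct of function
 * pointers, the same shape as the .tool initializer above. */
#include <stdio.h>

struct event {
	int type;		/* 0 = mmap, 1 = comm */
};

struct tool_ops {
	int (*mmap)(struct event *ev);
	int (*comm)(struct event *ev);
};

static int handle_mmap(struct event *ev) { (void)ev; return puts("mmap"); }
static int handle_comm(struct event *ev) { (void)ev; return puts("comm"); }

static int dispatch(const struct tool_ops *ops, struct event *ev)
{
	return ev->type == 0 ? ops->mmap(ev) : ops->comm(ev);
}

int main(void)
{
	struct tool_ops ops = { .mmap = handle_mmap, .comm = handle_comm };
	struct event ev = { .type = 0 };

	dispatch(&ops, &ev);
	return 0;
}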
....@@ -1568,7 +2440,7 @@
15682440 OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
15692441 "list of cpus to monitor"),
15702442 OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1571
- OPT_STRING('o', "output", &record.data.file.path, "file",
2443
+ OPT_STRING('o', "output", &record.data.path, "file",
15722444 "output file name"),
15732445 OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
15742446 &record.opts.no_inherit_set,
....@@ -1576,6 +2448,7 @@
15762448 OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
15772449 "synthesize non-sample events at the end of output"),
15782450 OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2451
+ OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
15792452 OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
15802453 "Fail if the specified frequency can't be used"),
15812454 OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
....@@ -1584,6 +2457,9 @@
15842457 OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
15852458 "number of mmap data pages and AUX area tracing mmap pages",
15862459 record__parse_mmap_pages),
2460
+ OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2461
+ "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2462
+ record__mmap_flush_parse),
15872463 OPT_BOOLEAN(0, "group", &record.opts.group,
15882464 "put the counters into a counter group"),
15892465 OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
....@@ -1617,8 +2493,9 @@
16172493 OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
16182494 "monitor event in cgroup name only",
16192495 parse_cgroups),
1620
- OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1621
- "ms to wait before starting measurement after program start"),
2496
+ OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2497
+ "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2498
+ OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
16222499 OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
16232500 "user to profile"),
16242501
....@@ -1637,10 +2514,10 @@
16372514 "use per-thread mmaps"),
16382515 OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
16392516 "sample selected machine registers on interrupt,"
1640
- " use -I ? to list register names", parse_regs),
2517
+ " use '-I?' to list register names", parse_intr_regs),
16412518 OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
16422519 "sample selected machine registers on interrupt,"
1643
- " use -I ? to list register names", parse_regs),
2520
+ " use '--user-regs=?' to list register names", parse_user_regs),
16442521 OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
16452522 "Record running/enabled time of read (:S) events"),
16462523 OPT_CALLBACK('k', "clockid", &record.opts,
....@@ -1648,18 +2525,27 @@
16482525 parse_clockid),
16492526 OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
16502527 "opts", "AUX area tracing Snapshot Mode", ""),
1651
- OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
2528
+ OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2529
+ "opts", "sample AUX area", ""),
2530
+ OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
16522531 "per thread proc mmap processing timeout in ms"),
16532532 OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
16542533 "Record namespaces events"),
1655
- OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1656
- "Record context switch events"),
2534
+ OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2535
+ "Record cgroup events"),
2536
+ OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2537
+ &record.opts.record_switch_events_set,
2538
+ "Record context switch events"),
16572539 OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
16582540 "Configure all used events to run in kernel space.",
16592541 PARSE_OPT_EXCLUSIVE),
16602542 OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
16612543 "Configure all used events to run in user space.",
16622544 PARSE_OPT_EXCLUSIVE),
2545
+ OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2546
+ "collect kernel callchains"),
2547
+ OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2548
+ "collect user callchains"),
16632549 OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
16642550 "clang binary to use for compiling BPF scriptlets"),
16652551 OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
....@@ -1673,11 +2559,45 @@
16732559 OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
16742560 "Record timestamp boundary (time of first/last samples)"),
16752561 OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
1676
- &record.switch_output.set, "signal,size,time",
1677
- "Switch output when receive SIGUSR2 or cross size,time threshold",
2562
+ &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2563
+ "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
16782564 "signal"),
2565
+ OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2566
+ "switch output event selector. use 'perf list' to list available events",
2567
+ parse_events_option_new_evlist),
2568
+ OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2569
+ "Limit number of switch output generated files"),
16792570 OPT_BOOLEAN(0, "dry-run", &dry_run,
16802571 "Parse options then exit"),
2572
+#ifdef HAVE_AIO_SUPPORT
2573
+ OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2574
+ &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2575
+ record__aio_parse),
2576
+#endif
2577
+ OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2578
+ "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2579
+ record__parse_affinity),
2580
+#ifdef HAVE_ZSTD_SUPPORT
2581
+ OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2582
+ "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2583
+ record__parse_comp_level),
2584
+#endif
2585
+ OPT_CALLBACK(0, "max-size", &record.output_max_size,
2586
+ "size", "Limit the maximum size of the output file", parse_output_max_size),
2587
+ OPT_UINTEGER(0, "num-thread-synthesize",
2588
+ &record.opts.nr_threads_synthesize,
2589
+ "number of threads to run for event synthesis"),
2590
+#ifdef HAVE_LIBPFM
2591
+ OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2592
+ "libpfm4 event selector. use 'perf list' to list available events",
2593
+ parse_libpfm_events_option),
2594
+#endif
2595
+ OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2596
+ "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2597
+ "\t\t\t 'snapshot': AUX area tracing snapshot).\n"
2598
+ "\t\t\t Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2599
+ "\t\t\t Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2600
+ parse_control_option),
16812601 OPT_END()
16822602 };
16832603
....@@ -1712,7 +2632,9 @@
17122632 # undef REASON
17132633 #endif
17142634
1715
- rec->evlist = perf_evlist__new();
2635
+ rec->opts.affinity = PERF_AFFINITY_SYS;
2636
+
2637
+ rec->evlist = evlist__new();
17162638 if (rec->evlist == NULL)
17172639 return -ENOMEM;
17182640
....@@ -1734,21 +2656,41 @@
17342656 "cgroup monitoring only available in system-wide mode");
17352657
17362658 }
2659
+
2660
+ if (rec->opts.kcore)
2661
+ rec->data.is_dir = true;
2662
+
2663
+ if (rec->opts.comp_level != 0) {
2664
+ pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2665
+ rec->no_buildid = true;
2666
+ }
2667
+
17372668 if (rec->opts.record_switch_events &&
17382669 !perf_can_record_switch_events()) {
17392670 ui__error("kernel does not support recording context switch events\n");
17402671 parse_options_usage(record_usage, record_options, "switch-events", 0);
1741
- return -EINVAL;
2672
+ err = -EINVAL;
2673
+ goto out_opts;
17422674 }
17432675
17442676 if (switch_output_setup(rec)) {
17452677 parse_options_usage(record_usage, record_options, "switch-output", 0);
1746
- return -EINVAL;
2678
+ err = -EINVAL;
2679
+ goto out_opts;
17472680 }
17482681
17492682 if (rec->switch_output.time) {
17502683 signal(SIGALRM, alarm_sig_handler);
17512684 alarm(rec->switch_output.time);
2685
+ }
2686
+
2687
+ if (rec->switch_output.num_files) {
2688
+ rec->switch_output.filenames = calloc(sizeof(char *),
2689
+ rec->switch_output.num_files);
2690
+ if (!rec->switch_output.filenames) {
2691
+ err = -EINVAL;
2692
+ goto out_opts;
2693
+ }
17522694 }
17532695
17542696 /*
....@@ -1758,6 +2700,17 @@
17582700 symbol_conf.allow_aliases = true;
17592701
17602702 symbol__init(NULL);
2703
+
2704
+ if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2705
+ rec->affinity_mask.nbits = cpu__max_cpu();
2706
+ rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2707
+ if (!rec->affinity_mask.bits) {
2708
+ pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2709
+ err = -ENOMEM;
2710
+ goto out_opts;
2711
+ }
2712
+ pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2713
+ }
17612714
17622715 err = record__auxtrace_init(rec);
17632716 if (err)
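The hunk above sizes rec->affinity_mask to cpu__max_cpu() bits so that, under --affinity=node or --affinity=cpu, the trace-reading thread can later be migrated onto the CPUs backing the mmap buffer being drained. A sketch of the underlying mechanism with plain sched_setaffinity(2) (Linux-specific, not perf's mmap_cpu_mask helpers):

/* Sketch: pin the calling thread to one CPU before draining that CPU's
 * buffer, which is roughly what the --affinity=cpu policy amounts to. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static int pin_to_cpu(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	return sched_setaffinity(0, sizeof(set), &set);	/* 0 = calling thread */
}

int main(void)
{
	if (pin_to_cpu(0)) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("reader thread now restricted to CPU 0\n");
	return 0;
}

The node policy generalizes this by setting every CPU of the buffer's NUMA node in the mask instead of a single CPU.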
....@@ -1775,16 +2728,6 @@
17752728 }
17762729
17772730 err = -ENOMEM;
1778
-
1779
- if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
1780
- pr_warning(
1781
-"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1782
-"check /proc/sys/kernel/kptr_restrict.\n\n"
1783
-"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1784
-"file is not found in the buildid cache or in the vmlinux path.\n\n"
1785
-"Samples in kernel modules won't be resolved at all.\n\n"
1786
-"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1787
-"even with a suitable vmlinux or kallsyms file.\n\n");
17882731
17892732 if (rec->no_buildid_cache || rec->no_buildid) {
17902733 disable_buildid_cache();
....@@ -1820,8 +2763,8 @@
18202763 if (record.opts.overwrite)
18212764 record.opts.tail_synthesize = true;
18222765
1823
- if (rec->evlist->nr_entries == 0 &&
1824
- __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2766
+ if (rec->evlist->core.nr_entries == 0 &&
2767
+ __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
18252768 pr_err("Not enough memory for event selector list\n");
18262769 goto out;
18272770 }
....@@ -1865,16 +2808,38 @@
18652808 if (rec->opts.full_auxtrace)
18662809 rec->buildid_all = true;
18672810
2811
+ if (rec->opts.text_poke) {
2812
+ err = record__config_text_poke(rec->evlist);
2813
+ if (err) {
2814
+ pr_err("record__config_text_poke failed, error %d\n", err);
2815
+ goto out;
2816
+ }
2817
+ }
2818
+
18682819 if (record_opts__config(&rec->opts)) {
18692820 err = -EINVAL;
18702821 goto out;
18712822 }
18722823
2824
+ if (rec->opts.nr_cblocks > nr_cblocks_max)
2825
+ rec->opts.nr_cblocks = nr_cblocks_max;
2826
+ pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2827
+
2828
+ pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2829
+ pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2830
+
2831
+ if (rec->opts.comp_level > comp_level_max)
2832
+ rec->opts.comp_level = comp_level_max;
2833
+ pr_debug("comp level: %d\n", rec->opts.comp_level);
2834
+
18732835 err = __cmd_record(&record, argc, argv);
18742836 out:
1875
- perf_evlist__delete(rec->evlist);
2837
+ bitmap_free(rec->affinity_mask.bits);
2838
+ evlist__delete(rec->evlist);
18762839 symbol__exit();
18772840 auxtrace_record__free(rec->itr);
2841
+out_opts:
2842
+ evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
18782843 return err;
18792844 }
18802845
....@@ -1882,12 +2847,7 @@
18822847 {
18832848 struct record *rec = &record;
18842849
1885
- if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1886
- trigger_hit(&auxtrace_snapshot_trigger);
1887
- auxtrace_record__snapshot_started = 1;
1888
- if (auxtrace_record__snapshot_start(record.itr))
1889
- trigger_error(&auxtrace_snapshot_trigger);
1890
- }
2850
+ hit_auxtrace_snapshot_trigger(rec);
18912851
18922852 if (switch_output_signal(rec))
18932853 trigger_hit(&switch_output_trigger);