2024-10-09 05e59e5fb0064c97a1c10921ecd549f2d4a58565
kernel/kernel/events/ring_buffer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * Performance events ring-buffer code:
  *
@@ -5,8 +6,6 @@
  * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
  * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * For licensing details see kernel-base/COPYING
  */
 
 #include <linux/perf_event.h>
@@ -36,17 +35,33 @@
  */
 static void perf_output_get_handle(struct perf_output_handle *handle)
 {
-	struct ring_buffer *rb = handle->rb;
+	struct perf_buffer *rb = handle->rb;
 
 	preempt_disable();
-	local_inc(&rb->nest);
+
+	/*
+	 * Avoid an explicit LOAD/STORE such that architectures with memops
+	 * can use them.
+	 */
+	(*(volatile unsigned int *)&rb->nest)++;
 	handle->wakeup = local_read(&rb->wakeup);
 }
 
 static void perf_output_put_handle(struct perf_output_handle *handle)
 {
-	struct ring_buffer *rb = handle->rb;
+	struct perf_buffer *rb = handle->rb;
 	unsigned long head;
+	unsigned int nest;
+
+	/*
+	 * If this isn't the outermost nesting, we don't have to update
+	 * @rb->user_page->data_head.
+	 */
+	nest = READ_ONCE(rb->nest);
+	if (nest > 1) {
+		WRITE_ONCE(rb->nest, nest - 1);
+		goto out;
+	}
 
 again:
 	/*
@@ -64,15 +79,6 @@
 	 * IRQ/NMI can happen here and advance @rb->head, causing our
 	 * load above to be stale.
 	 */
-
-	/*
-	 * If this isn't the outermost nesting, we don't have to update
-	 * @rb->user_page->data_head.
-	 */
-	if (local_read(&rb->nest) > 1) {
-		local_dec(&rb->nest);
-		goto out;
-	}
 
 	/*
 	 * Since the mmap() consumer (userspace) can run on a different CPU:
@@ -109,7 +115,7 @@
 	 * write will (temporarily) publish a stale value.
 	 */
 	barrier();
-	local_set(&rb->nest, 0);
+	WRITE_ONCE(rb->nest, 0);
 
 	/*
 	 * Ensure we decrement @rb->nest before we validate the @rb->head.
@@ -117,7 +123,7 @@
 	 */
 	barrier();
 	if (unlikely(head != local_read(&rb->head))) {
-		local_inc(&rb->nest);
+		WRITE_ONCE(rb->nest, 1);
 		goto again;
 	}
 
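The nest counter above is what keeps nested writers (an IRQ/NMI landing between
perf_output_get_handle() and perf_output_put_handle()) from publishing data_head
out of order: only the outermost writer updates the user page, and the volatile
increment lets the compiler emit a single memory-op on architectures that have one.
A minimal user-space sketch of that scheme, with illustrative names only (buf,
get_handle and put_handle are not kernel APIs):

/* build: gcc -O2 -o nest nest.c */
#include <stdio.h>

struct buf {
	unsigned int nest;		/* only nested via local interrupts in the kernel */
	unsigned long head;		/* writer-side head */
	unsigned long user_head;	/* what the reader (user page) sees */
};

static void get_handle(struct buf *b)
{
	/* volatile increment: lets the compiler use a single memop */
	(*(volatile unsigned int *)&b->nest)++;
}

static void put_handle(struct buf *b)
{
	unsigned int nest = *(volatile unsigned int *)&b->nest;

	if (nest > 1) {			/* inner nesting level: just drop the count */
		*(volatile unsigned int *)&b->nest = nest - 1;
		return;
	}

	b->user_head = b->head;		/* outermost level: publish the head */
	*(volatile unsigned int *)&b->nest = 0;
}

int main(void)
{
	struct buf b = { 0 };

	get_handle(&b); b.head = 64;
	get_handle(&b); b.head = 128;	/* pretend an NMI nested here */
	put_handle(&b);			/* inner: no publish */
	put_handle(&b);			/* outer: publishes 128 */
	printf("published head = %lu\n", b.user_head);
	return 0;
}

The real code additionally needs the barrier()s and the head-recheck loop shown in
the hunks above, because a nested writer can advance rb->head after the outermost
writer has already loaded it.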
@@ -141,10 +147,11 @@
 
 static __always_inline int
 __perf_output_begin(struct perf_output_handle *handle,
+		    struct perf_sample_data *data,
 		    struct perf_event *event, unsigned int size,
 		    bool backward)
 {
-	struct ring_buffer *rb;
+	struct perf_buffer *rb;
 	unsigned long tail, offset, head;
 	int have_lost, page_shift;
 	struct {
@@ -231,18 +238,16 @@
 	handle->size = (1UL << page_shift) - offset;
 
 	if (unlikely(have_lost)) {
-		struct perf_sample_data sample_data;
-
 		lost_event.header.size = sizeof(lost_event);
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.id = event->id;
 		lost_event.lost = local_xchg(&rb->lost, 0);
 
-		perf_event_header__init_id(&lost_event.header,
-					   &sample_data, event);
+		/* XXX mostly redundant; @data is already fully initialized */
+		perf_event_header__init_id(&lost_event.header, data, event);
 		perf_output_put(handle, lost_event);
-		perf_event__output_id_sample(event, handle, &sample_data);
+		perf_event__output_id_sample(event, handle, data);
 	}
 
 	return 0;
@@ -257,22 +262,25 @@
 }
 
 int perf_output_begin_forward(struct perf_output_handle *handle,
-			      struct perf_event *event, unsigned int size)
+			      struct perf_sample_data *data,
+			      struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, false);
+	return __perf_output_begin(handle, data, event, size, false);
 }
 
 int perf_output_begin_backward(struct perf_output_handle *handle,
+			       struct perf_sample_data *data,
 			       struct perf_event *event, unsigned int size)
 {
-	return __perf_output_begin(handle, event, size, true);
+	return __perf_output_begin(handle, data, event, size, true);
 }
 
 int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_sample_data *data,
 		      struct perf_event *event, unsigned int size)
 {
 
-	return __perf_output_begin(handle, event, size,
+	return __perf_output_begin(handle, data, event, size,
 				   unlikely(is_write_backward(event)));
 }
 
@@ -295,7 +303,7 @@
 }
 
 static void
-ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
+ring_buffer_init(struct perf_buffer *rb, long watermark, int flags)
 {
 	long max_size = perf_data_size(rb);
 
@@ -310,7 +318,7 @@
 	else
 		rb->overwrite = 1;
 
-	atomic_set(&rb->refcount, 1);
+	refcount_set(&rb->refcount, 1);
 
 	INIT_LIST_HEAD(&rb->event_list);
 	spin_lock_init(&rb->event_lock);
@@ -355,7 +363,8 @@
 {
 	struct perf_event *output_event = event;
 	unsigned long aux_head, aux_tail;
-	struct ring_buffer *rb;
+	struct perf_buffer *rb;
+	unsigned int nest;
 
 	if (output_event->parent)
 		output_event = output_event->parent;
@@ -383,15 +392,18 @@
 	if (!atomic_read(&rb->aux_mmap_count))
 		goto err;
 
-	if (!atomic_inc_not_zero(&rb->aux_refcount))
+	if (!refcount_inc_not_zero(&rb->aux_refcount))
 		goto err;
 
+	nest = READ_ONCE(rb->aux_nest);
 	/*
 	 * Nesting is not supported for AUX area, make sure nested
 	 * writers are caught early
 	 */
-	if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+	if (WARN_ON_ONCE(nest))
 		goto err_put;
+
+	WRITE_ONCE(rb->aux_nest, nest + 1);
 
 	aux_head = rb->aux_head;
 
@@ -420,7 +432,7 @@
 		if (!handle->size) { /* A, matches D */
 			event->pending_disable = smp_processor_id();
 			perf_output_wakeup(handle);
-			local_set(&rb->aux_nest, 0);
+			WRITE_ONCE(rb->aux_nest, 0);
 			goto err_put;
 		}
 	}
@@ -439,7 +451,7 @@
 }
 EXPORT_SYMBOL_GPL(perf_aux_output_begin);
 
-static __always_inline bool rb_need_aux_wakeup(struct ring_buffer *rb)
+static __always_inline bool rb_need_aux_wakeup(struct perf_buffer *rb)
 {
 	if (rb->aux_overwrite)
 		return false;
@@ -465,7 +477,7 @@
 void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 {
 	bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED);
-	struct ring_buffer *rb = handle->rb;
+	struct perf_buffer *rb = handle->rb;
 	unsigned long aux_head;
 
 	/* in overwrite mode, driver provides aux_head via handle */
@@ -481,14 +493,21 @@
 		rb->aux_head += size;
 	}
 
-	if (size || handle->aux_flags) {
-		/*
-		 * Only send RECORD_AUX if we have something useful to communicate
-		 */
-
+	/*
+	 * Only send RECORD_AUX if we have something useful to communicate
+	 *
+	 * Note: the OVERWRITE records by themselves are not considered
+	 * useful, as they don't communicate any *new* information,
+	 * aside from the short-lived offset, that becomes history at
+	 * the next event sched-in and therefore isn't useful.
+	 * The userspace that needs to copy out AUX data in overwrite
+	 * mode should know to use user_page::aux_head for the actual
+	 * offset. So, from now on we don't output AUX records that
+	 * have *only* OVERWRITE flag set.
+	 */
+	if (size || (handle->aux_flags & ~(u64)PERF_AUX_FLAG_OVERWRITE))
 		perf_event_aux_event(handle->event, aux_head, size,
-				     handle->aux_flags);
-	}
+				     handle->aux_flags);
 
 	WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
 	if (rb_need_aux_wakeup(rb))
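The reworked test above emits PERF_RECORD_AUX only when there is payload or at
least one flag other than OVERWRITE set. A small user-space check of that mask
arithmetic (the PERF_AUX_FLAG_* values come from the perf UAPI header;
should_emit_aux() is an illustrative helper, not a kernel function):

/* build on Linux: gcc -O2 -o aux_flags aux_flags.c */
#include <stdio.h>
#include <stdint.h>
#include <linux/perf_event.h>	/* PERF_AUX_FLAG_OVERWRITE, PERF_AUX_FLAG_TRUNCATED */

static int should_emit_aux(unsigned long size, uint64_t aux_flags)
{
	/* data written, or any flag besides OVERWRITE, is worth reporting */
	return size || (aux_flags & ~(uint64_t)PERF_AUX_FLAG_OVERWRITE);
}

int main(void)
{
	/* OVERWRITE alone: suppressed */
	printf("%d\n", should_emit_aux(0, PERF_AUX_FLAG_OVERWRITE));
	/* OVERWRITE plus TRUNCATED: still emitted */
	printf("%d\n", should_emit_aux(0, PERF_AUX_FLAG_OVERWRITE | PERF_AUX_FLAG_TRUNCATED));
	/* data was written: emitted */
	printf("%d\n", should_emit_aux(4096, PERF_AUX_FLAG_OVERWRITE));
	return 0;
}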
@@ -502,7 +521,7 @@
 
 	handle->event = NULL;
 
-	local_set(&rb->aux_nest, 0);
+	WRITE_ONCE(rb->aux_nest, 0);
 	/* can't be last */
 	rb_free_aux(rb);
 	ring_buffer_put(rb);
@@ -515,7 +534,7 @@
  */
 int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 {
-	struct ring_buffer *rb = handle->rb;
+	struct perf_buffer *rb = handle->rb;
 
 	if (size > handle->size)
 		return -ENOSPC;
@@ -545,6 +564,42 @@
 }
 EXPORT_SYMBOL_GPL(perf_get_aux);
 
+/*
+ * Copy out AUX data from an AUX handle.
+ */
+long perf_output_copy_aux(struct perf_output_handle *aux_handle,
+			  struct perf_output_handle *handle,
+			  unsigned long from, unsigned long to)
+{
+	struct perf_buffer *rb = aux_handle->rb;
+	unsigned long tocopy, remainder, len = 0;
+	void *addr;
+
+	from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+	to &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+
+	do {
+		tocopy = PAGE_SIZE - offset_in_page(from);
+		if (to > from)
+			tocopy = min(tocopy, to - from);
+		if (!tocopy)
+			break;
+
+		addr = rb->aux_pages[from >> PAGE_SHIFT];
+		addr += offset_in_page(from);
+
+		remainder = perf_output_copy(handle, addr, tocopy);
+		if (remainder)
+			return -EFAULT;
+
+		len += tocopy;
+		from += tocopy;
+		from &= (rb->aux_nr_pages << PAGE_SHIFT) - 1;
+	} while (to != from);
+
+	return len;
+}
+
 #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
 static struct page *rb_alloc_aux_page(int node, int order)
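perf_output_copy_aux() treats the AUX area as a power-of-two ring: it masks
from/to into the buffer, copies at most one page per iteration, and wraps until
it reaches to. A user-space analogue of that loop, assuming a 4 KiB page and
using memcpy() in place of perf_output_copy() (copy_ring(), PG_SIZE and NR_PAGES
are illustrative only):

/* build: gcc -O2 -o ringcopy ringcopy.c */
#include <stdio.h>
#include <string.h>

#define PG_SIZE		4096UL			/* stand-in for PAGE_SIZE */
#define NR_PAGES	4UL			/* power of two, like rb->aux_nr_pages */
#define BUF_SIZE	(NR_PAGES * PG_SIZE)

static unsigned char src[BUF_SIZE], dst[BUF_SIZE];

/* copy the ring range [from, to) out of src into dst, one page chunk at a time */
static long copy_ring(unsigned long from, unsigned long to)
{
	unsigned long tocopy, len = 0;

	from &= BUF_SIZE - 1;
	to   &= BUF_SIZE - 1;

	do {
		tocopy = PG_SIZE - (from & (PG_SIZE - 1));	/* rest of this page */
		if (to > from)
			tocopy = tocopy < to - from ? tocopy : to - from;
		if (!tocopy)
			break;

		memcpy(dst + len, src + from, tocopy);

		len  += tocopy;
		from += tocopy;
		from &= BUF_SIZE - 1;				/* wrap around */
	} while (to != from);

	return len;
}

int main(void)
{
	/* a range that wraps: the last page and a half, then the first page */
	printf("copied %ld bytes\n", copy_ring(BUF_SIZE - 6144, 4096));
	return 0;
}

The kernel version returns -EFAULT instead when perf_output_copy() cannot write a
chunk into the destination handle.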
@@ -573,7 +628,7 @@
 	return page;
 }
 
-static void rb_free_aux_page(struct ring_buffer *rb, int idx)
+static void rb_free_aux_page(struct perf_buffer *rb, int idx)
 {
 	struct page *page = virt_to_page(rb->aux_pages[idx]);
 
@@ -582,7 +637,7 @@
 	__free_page(page);
 }
 
-static void __rb_free_aux(struct ring_buffer *rb)
+static void __rb_free_aux(struct perf_buffer *rb)
 {
 	int pg;
 
@@ -609,34 +664,31 @@
 	}
 }
 
-int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
+int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event,
 		 pgoff_t pgoff, int nr_pages, long watermark, int flags)
 {
 	bool overwrite = !(flags & RING_BUFFER_WRITABLE);
 	int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu);
-	int ret = -ENOMEM, max_order = 0;
+	int ret = -ENOMEM, max_order;
 
 	if (!has_aux(event))
 		return -EOPNOTSUPP;
 
-	if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) {
-		/*
-		 * We need to start with the max_order that fits in nr_pages,
-		 * not the other way around, hence ilog2() and not get_order.
-		 */
-		max_order = ilog2(nr_pages);
+	/*
+	 * We need to start with the max_order that fits in nr_pages,
+	 * not the other way around, hence ilog2() and not get_order.
+	 */
+	max_order = ilog2(nr_pages);
 
-		/*
-		 * PMU requests more than one contiguous chunks of memory
-		 * for SW double buffering
-		 */
-		if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) &&
-		    !overwrite) {
-			if (!max_order)
-				return -EINVAL;
+	/*
+	 * PMU requests more than one contiguous chunks of memory
+	 * for SW double buffering
	 */
+	if (!overwrite) {
+		if (!max_order)
+			return -EINVAL;
 
-			max_order--;
-		}
+		max_order--;
 	}
 
 	rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL,
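With the PERF_PMU_CAP_AUX_NO_SG special case gone, every PMU starts from
max_order = ilog2(nr_pages), and a writable (non-overwrite) buffer gives up one
order so it can be software double-buffered as two contiguous halves. A quick
user-space check of that arithmetic (aux_max_order() and the open-coded ilog2
are illustrative only): 16 pages yields order 4 in overwrite mode and order 3
otherwise.

/* build: gcc -O2 -o max_order max_order.c */
#include <stdio.h>

static int ilog2_ul(unsigned long v)		/* floor(log2(v)), v > 0 */
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

static int aux_max_order(int nr_pages, int overwrite)
{
	int max_order = ilog2_ul(nr_pages);

	if (!overwrite) {
		if (!max_order)
			return -1;	/* -EINVAL in the kernel */
		max_order--;		/* split into two chunks for SW double buffering */
	}
	return max_order;
}

int main(void)
{
	printf("%d\n", aux_max_order(16, 1));	/* 4: one 16-page chunk */
	printf("%d\n", aux_max_order(16, 0));	/* 3: two 8-page chunks */
	printf("%d\n", aux_max_order(1, 0));	/* -1: one page can't be double-buffered */
	return 0;
}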
@@ -686,7 +738,7 @@
 	 * we keep a refcount here to make sure either of the two can
 	 * reference them safely.
 	 */
-	atomic_set(&rb->aux_refcount, 1);
+	refcount_set(&rb->aux_refcount, 1);
 
 	rb->aux_overwrite = overwrite;
 	rb->aux_watermark = watermark;
@@ -703,9 +755,9 @@
 	return ret;
 }
 
-void rb_free_aux(struct ring_buffer *rb)
+void rb_free_aux(struct perf_buffer *rb)
 {
-	if (atomic_dec_and_test(&rb->aux_refcount))
+	if (refcount_dec_and_test(&rb->aux_refcount))
 		__rb_free_aux(rb);
 }
 
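rb->refcount and rb->aux_refcount move from atomic_t to refcount_t, which warns
and saturates on overflow/underflow instead of silently wrapping. A user-space
sketch of the three operations this file uses (refcount_set(),
refcount_inc_not_zero() and refcount_dec_and_test()) without the saturation and
WARN machinery; the rc_* names are illustrative, not the kernel API:

/* build: gcc -std=c11 -O2 -o refcount refcount.c */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef struct { atomic_uint refs; } refcount_sketch_t;

static void rc_set(refcount_sketch_t *r, unsigned int n)
{
	atomic_store(&r->refs, n);
}

static bool rc_inc_not_zero(refcount_sketch_t *r)
{
	unsigned int old = atomic_load(&r->refs);

	do {
		if (!old)
			return false;		/* object is already being torn down */
	} while (!atomic_compare_exchange_weak(&r->refs, &old, old + 1));

	return true;
}

static bool rc_dec_and_test(refcount_sketch_t *r)
{
	return atomic_fetch_sub(&r->refs, 1) == 1;	/* true: that was the last reference */
}

int main(void)
{
	refcount_sketch_t rc;

	rc_set(&rc, 1);					/* rb_alloc_aux() */
	printf("got ref: %d\n", rc_inc_not_zero(&rc));	/* perf_aux_output_begin(): 1 */
	printf("last: %d\n", rc_dec_and_test(&rc));	/* 0 */
	printf("last: %d\n", rc_dec_and_test(&rc));	/* 1 -> __rb_free_aux() */
	return 0;
}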
@@ -716,7 +768,7 @@
  */
 
 static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 {
 	if (pgoff > rb->nr_pages)
 		return NULL;
@@ -740,13 +792,21 @@
 	return page_address(page);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+static void perf_mmap_free_page(void *addr)
 {
-	struct ring_buffer *rb;
+	struct page *page = virt_to_page(addr);
+
+	page->mapping = NULL;
+	__free_page(page);
+}
+
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+{
+	struct perf_buffer *rb;
 	unsigned long size;
 	int i;
 
-	size = sizeof(struct ring_buffer);
+	size = sizeof(struct perf_buffer);
 	size += nr_pages * sizeof(void *);
 
 	if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
@@ -774,9 +834,9 @@
 
 fail_data_pages:
 	for (i--; i >= 0; i--)
-		free_page((unsigned long)rb->data_pages[i]);
+		perf_mmap_free_page(rb->data_pages[i]);
 
-	free_page((unsigned long)rb->user_page);
+	perf_mmap_free_page(rb->user_page);
 
 fail_user_page:
 	kfree(rb);
@@ -785,32 +845,19 @@
 	return NULL;
 }
 
-static void perf_mmap_free_page(unsigned long addr)
-{
-	struct page *page = virt_to_page((void *)addr);
-
-	page->mapping = NULL;
-	__free_page(page);
-}
-
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
 {
 	int i;
 
-	perf_mmap_free_page((unsigned long)rb->user_page);
+	perf_mmap_free_page(rb->user_page);
 	for (i = 0; i < rb->nr_pages; i++)
-		perf_mmap_free_page((unsigned long)rb->data_pages[i]);
+		perf_mmap_free_page(rb->data_pages[i]);
 	kfree(rb);
 }
 
 #else
-static int data_page_nr(struct ring_buffer *rb)
-{
-	return rb->nr_pages << page_order(rb);
-}
-
 static struct page *
-__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+__perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 {
 	/* The '>' counts in the user page. */
 	if (pgoff > data_page_nr(rb))
@@ -828,11 +875,11 @@
 
 static void rb_free_work(struct work_struct *work)
 {
-	struct ring_buffer *rb;
+	struct perf_buffer *rb;
 	void *base;
 	int i, nr;
 
-	rb = container_of(work, struct ring_buffer, work);
+	rb = container_of(work, struct perf_buffer, work);
 	nr = data_page_nr(rb);
 
 	base = rb->user_page;
@@ -844,18 +891,18 @@
 	kfree(rb);
 }
 
-void rb_free(struct ring_buffer *rb)
+void rb_free(struct perf_buffer *rb)
 {
 	schedule_work(&rb->work);
 }
 
-struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
+struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
 {
-	struct ring_buffer *rb;
+	struct perf_buffer *rb;
 	unsigned long size;
 	void *all_buf;
 
-	size = sizeof(struct ring_buffer);
+	size = sizeof(struct perf_buffer);
 	size += sizeof(void *);
 
 	rb = kzalloc(size, GFP_KERNEL);
@@ -889,7 +936,7 @@
 #endif
 
 struct page *
-perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
+perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
 {
 	if (rb->aux_nr_pages) {
 		/* above AUX space */