2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/trace/ring_buffer.c
....@@ -11,6 +11,7 @@
1111 #include <linux/trace_seq.h>
1212 #include <linux/spinlock.h>
1313 #include <linux/irq_work.h>
14
+#include <linux/security.h>
1415 #include <linux/uaccess.h>
1516 #include <linux/hardirq.h>
1617 #include <linux/kthread.h> /* for self test */
....@@ -201,7 +202,7 @@
201202 case RINGBUF_TYPE_DATA:
202203 return rb_event_data_length(event);
203204 default:
204
- BUG();
205
+ WARN_ON_ONCE(1);
205206 }
206207 /* not hit */
207208 return 0;
....@@ -257,7 +258,7 @@
257258 {
258259 if (extended_time(event))
259260 event = skip_time_extend(event);
260
- BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
261
+ WARN_ON_ONCE(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
261262 /* If length is in len field, then array[0] has the data */
262263 if (event->type_len)
263264 return (void *)&event->array[0];
....@@ -277,6 +278,9 @@
277278
278279 #define for_each_buffer_cpu(buffer, cpu) \
279280 for_each_cpu(cpu, buffer->cpumask)
281
+
282
+#define for_each_online_buffer_cpu(buffer, cpu) \
283
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
280284
281285 #define TS_SHIFT 27
282286 #define TS_MASK ((1ULL << TS_SHIFT) - 1)
....@@ -307,8 +311,6 @@
307311 #define RB_MISSED_EVENTS (1 << 31)
308312 /* Missed count stored at end */
309313 #define RB_MISSED_STORED (1 << 30)
310
-
311
-#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
312314
313315 struct buffer_data_page {
314316 u64 time_stamp; /* page time stamp */
....@@ -353,24 +355,11 @@
353355 local_set(&bpage->commit, 0);
354356 }
355357
356
-/**
357
- * ring_buffer_page_len - the size of data on the page.
358
- * @page: The page to read
359
- *
360
- * Returns the amount of data on the page, including buffer page header.
361
- */
362
-size_t ring_buffer_page_len(void *page)
358
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
363359 {
364
- struct buffer_data_page *bpage = page;
365
-
366
- return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
367
- + BUF_PAGE_HDR_SIZE;
360
+ return local_read(&bpage->page->commit);
368361 }
369362
370
-/*
371
- * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
372
- * this issue out.
373
- */
374363 static void free_buffer_page(struct buffer_page *bpage)
375364 {
376365 free_page((unsigned long)bpage->page);
....@@ -426,6 +415,7 @@
426415 struct irq_work work;
427416 wait_queue_head_t waiters;
428417 wait_queue_head_t full_waiters;
418
+ long wait_index;
429419 bool waiters_pending;
430420 bool full_waiters_pending;
431421 bool wakeup_full;
....@@ -437,11 +427,26 @@
437427 struct rb_event_info {
438428 u64 ts;
439429 u64 delta;
430
+ u64 before;
431
+ u64 after;
440432 unsigned long length;
441433 struct buffer_page *tail_page;
442434 int add_timestamp;
443435 };
444436
437
+/*
438
+ * Used for the add_timestamp
439
+ * NONE
440
+ * EXTEND - wants a time extend
441
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
442
+ * FORCE - force a full time stamp.
443
+ */
444
+enum {
445
+ RB_ADD_STAMP_NONE = 0,
446
+ RB_ADD_STAMP_EXTEND = BIT(1),
447
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
448
+ RB_ADD_STAMP_FORCE = BIT(3)
449
+};
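The stamp reasons above are bit flags rather than a plain enumeration, so more than one can be attached to the same event and they can be tested as a group. A minimal sketch of that usage, mirroring how the reserve path later in this patch combines and tests them (the helper name is invented for illustration):

static inline bool example_uses_full_stamp(struct rb_event_info *info)
{
        /* e.g. an interrupted time update forces a full time stamp */
        info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;

        /* FORCE and ABSOLUTE both mean the absolute timestamp is written */
        return info->add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
}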
445450 /*
446451 * Used for which event context the event is in.
447452 * TRANSITION = 0
....@@ -461,13 +466,36 @@
461466 RB_CTX_MAX
462467 };
463468
469
+#if BITS_PER_LONG == 32
470
+#define RB_TIME_32
471
+#endif
472
+
473
+/* To test on 64 bit machines */
474
+//#define RB_TIME_32
475
+
476
+#ifdef RB_TIME_32
477
+
478
+struct rb_time_struct {
479
+ local_t cnt;
480
+ local_t top;
481
+ local_t bottom;
482
+};
483
+#else
484
+#include <asm/local64.h>
485
+struct rb_time_struct {
486
+ local64_t time;
487
+};
488
+#endif
489
+typedef struct rb_time_struct rb_time_t;
490
+
464491 /*
465492 * head_page == tail_page && head == tail then buffer is empty.
466493 */
467494 struct ring_buffer_per_cpu {
468495 int cpu;
469496 atomic_t record_disabled;
470
- struct ring_buffer *buffer;
497
+ atomic_t resize_disabled;
498
+ struct trace_buffer *buffer;
471499 raw_spinlock_t reader_lock; /* serialize readers */
472500 arch_spinlock_t lock;
473501 struct lock_class_key lock_key;
....@@ -489,10 +517,18 @@
489517 local_t dropped_events;
490518 local_t committing;
491519 local_t commits;
520
+ local_t pages_touched;
521
+ local_t pages_lost;
522
+ local_t pages_read;
523
+ long last_pages_touch;
524
+ size_t shortest_full;
492525 unsigned long read;
493526 unsigned long read_bytes;
494
- u64 write_stamp;
527
+ rb_time_t write_stamp;
528
+ rb_time_t before_stamp;
495529 u64 read_stamp;
530
+ /* pages removed since last reset */
531
+ unsigned long pages_removed;
496532 /* ring buffer pages to update, > 0 to add, < 0 to remove */
497533 long nr_pages_to_update;
498534 struct list_head new_pages; /* new pages to add */
....@@ -502,11 +538,11 @@
502538 struct rb_irq_work irq_work;
503539 };
504540
505
-struct ring_buffer {
541
+struct trace_buffer {
506542 unsigned flags;
507543 int cpus;
508544 atomic_t record_disabled;
509
- atomic_t resize_disabled;
545
+ atomic_t resizing;
510546 cpumask_var_t cpumask;
511547
512548 struct lock_class_key *reader_lock_key;
....@@ -525,11 +561,257 @@
525561 struct ring_buffer_iter {
526562 struct ring_buffer_per_cpu *cpu_buffer;
527563 unsigned long head;
564
+ unsigned long next_event;
528565 struct buffer_page *head_page;
529566 struct buffer_page *cache_reader_page;
530567 unsigned long cache_read;
568
+ unsigned long cache_pages_removed;
531569 u64 read_stamp;
570
+ u64 page_stamp;
571
+ struct ring_buffer_event *event;
572
+ int missed_events;
532573 };
574
+
575
+#ifdef RB_TIME_32
576
+
577
+/*
578
+ * On 32 bit machines, local64_t is very expensive. As the ring
579
+ * buffer doesn't need all the features of a true 64 bit atomic,
580
+ * on 32 bit, it uses these functions (64 still uses local64_t).
581
+ *
582
+ * For the ring buffer, 64 bit required operations for the time is
583
+ * the following:
584
+ *
585
+ * - Only need 59 bits (uses 60 to make it even).
586
+ * - Reads may fail if it interrupted a modification of the time stamp.
587
+ * It will succeed if it did not interrupt another write even if
588
+ * the read itself is interrupted by a write.
589
+ * It returns whether it was successful or not.
590
+ *
591
+ * - Writes always succeed and will overwrite other writes and writes
592
+ * that were done by events interrupting the current write.
593
+ *
594
+ * - A write followed by a read of the same time stamp will always succeed,
595
+ * but may not contain the same value.
596
+ *
597
+ * - A cmpxchg will fail if it interrupted another write or cmpxchg.
598
+ * Other than that, it acts like a normal cmpxchg.
599
+ *
600
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
601
+ * (bottom being the least significant 30 bits of the 60 bit time stamp).
602
+ *
603
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
604
+ * Each update will increment this counter by one.
605
+ * When reading the top and bottom, if the two counter bits match then the
606
+ * top and bottom together make a valid 60 bit number.
607
+ */
608
+#define RB_TIME_SHIFT 30
609
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
610
+
611
+static inline int rb_time_cnt(unsigned long val)
612
+{
613
+ return (val >> RB_TIME_SHIFT) & 3;
614
+}
615
+
616
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
617
+{
618
+ u64 val;
619
+
620
+ val = top & RB_TIME_VAL_MASK;
621
+ val <<= RB_TIME_SHIFT;
622
+ val |= bottom & RB_TIME_VAL_MASK;
623
+
624
+ return val;
625
+}
626
+
627
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
628
+{
629
+ unsigned long top, bottom;
630
+ unsigned long c;
631
+
632
+ /*
633
+ * If the read is interrupted by a write, then the cnt will
634
+ * be different. Loop until both top and bottom have been read
635
+ * without interruption.
636
+ */
637
+ do {
638
+ c = local_read(&t->cnt);
639
+ top = local_read(&t->top);
640
+ bottom = local_read(&t->bottom);
641
+ } while (c != local_read(&t->cnt));
642
+
643
+ *cnt = rb_time_cnt(top);
644
+
645
+ /* If top and bottom counts don't match, this interrupted a write */
646
+ if (*cnt != rb_time_cnt(bottom))
647
+ return false;
648
+
649
+ *ret = rb_time_val(top, bottom);
650
+ return true;
651
+}
652
+
653
+static bool rb_time_read(rb_time_t *t, u64 *ret)
654
+{
655
+ unsigned long cnt;
656
+
657
+ return __rb_time_read(t, ret, &cnt);
658
+}
659
+
660
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
661
+{
662
+ return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
663
+}
664
+
665
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
666
+{
667
+ *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
668
+ *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
669
+}
670
+
671
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
672
+{
673
+ val = rb_time_val_cnt(val, cnt);
674
+ local_set(t, val);
675
+}
676
+
677
+static void rb_time_set(rb_time_t *t, u64 val)
678
+{
679
+ unsigned long cnt, top, bottom;
680
+
681
+ rb_time_split(val, &top, &bottom);
682
+
683
+ /* Writes always succeed with a valid number even if it gets interrupted. */
684
+ do {
685
+ cnt = local_inc_return(&t->cnt);
686
+ rb_time_val_set(&t->top, top, cnt);
687
+ rb_time_val_set(&t->bottom, bottom, cnt);
688
+ } while (cnt != local_read(&t->cnt));
689
+}
690
+
691
+static inline bool
692
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
693
+{
694
+ unsigned long ret;
695
+
696
+ ret = local_cmpxchg(l, expect, set);
697
+ return ret == expect;
698
+}
699
+
700
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
701
+{
702
+ unsigned long cnt, top, bottom;
703
+ unsigned long cnt2, top2, bottom2;
704
+ u64 val;
705
+
706
+ /* The cmpxchg always fails if it interrupted an update */
707
+ if (!__rb_time_read(t, &val, &cnt2))
708
+ return false;
709
+
710
+ if (val != expect)
711
+ return false;
712
+
713
+ cnt = local_read(&t->cnt);
714
+ if ((cnt & 3) != cnt2)
715
+ return false;
716
+
717
+ cnt2 = cnt + 1;
718
+
719
+ rb_time_split(val, &top, &bottom);
720
+ top = rb_time_val_cnt(top, cnt);
721
+ bottom = rb_time_val_cnt(bottom, cnt);
722
+
723
+ rb_time_split(set, &top2, &bottom2);
724
+ top2 = rb_time_val_cnt(top2, cnt2);
725
+ bottom2 = rb_time_val_cnt(bottom2, cnt2);
726
+
727
+ if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
728
+ return false;
729
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
730
+ return false;
731
+ if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
732
+ return false;
733
+ return true;
734
+}
735
+
736
+#else /* 64 bits */
737
+
738
+/* local64_t always succeeds */
739
+
740
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
741
+{
742
+ *ret = local64_read(&t->time);
743
+ return true;
744
+}
745
+static void rb_time_set(rb_time_t *t, u64 val)
746
+{
747
+ local64_set(&t->time, val);
748
+}
749
+
750
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
751
+{
752
+ u64 val;
753
+ val = local64_cmpxchg(&t->time, expect, set);
754
+ return val == expect;
755
+}
756
+#endif
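On 32-bit, the scheme above can be pictured with concrete numbers: a 60-bit value is stored as two 30-bit halves, each tagged in bits 30-31 with the same 2-bit update counter, and a read is only trusted when both tags match. A small hypothetical round trip using the helpers defined above (values invented for illustration):

static void example_rb_time_roundtrip(rb_time_t *t)
{
        u64 ts = 0x123456789ULL;        /* any value that fits in 60 bits */
        u64 out;

        /* Tags both halves with the same incremented counter */
        rb_time_set(t, ts);

        /*
         * Succeeds only when the counters in the top and bottom halves
         * match, i.e. both halves came from the same rb_time_set(). A
         * mismatch means the read raced with a writer and the value
         * must be treated as unavailable.
         */
        if (rb_time_read(t, &out))
                WARN_ON(out != ts);
}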
757
+
758
+/**
759
+ * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
760
+ * @buffer: The ring_buffer to get the number of pages from
761
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
762
+ *
763
+ * Returns the number of pages used by a per_cpu buffer of the ring buffer.
764
+ */
765
+size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu)
766
+{
767
+ return buffer->buffers[cpu]->nr_pages;
768
+}
769
+
770
+/**
771
+ * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer
772
+ * @buffer: The ring_buffer to get the number of pages from
773
+ * @cpu: The cpu of the ring_buffer to get the number of pages from
774
+ *
775
+ * Returns the number of pages that have content in the ring buffer.
776
+ */
777
+size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu)
778
+{
779
+ size_t read;
780
+ size_t lost;
781
+ size_t cnt;
782
+
783
+ read = local_read(&buffer->buffers[cpu]->pages_read);
784
+ lost = local_read(&buffer->buffers[cpu]->pages_lost);
785
+ cnt = local_read(&buffer->buffers[cpu]->pages_touched);
786
+
787
+ if (WARN_ON_ONCE(cnt < lost))
788
+ return 0;
789
+
790
+ cnt -= lost;
791
+
792
+ /* The reader can read an empty page, but not more than that */
793
+ if (cnt < read) {
794
+ WARN_ON_ONCE(read > cnt + 1);
795
+ return 0;
796
+ }
797
+
798
+ return cnt - read;
799
+}
800
+
801
+static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int full)
802
+{
803
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
804
+ size_t nr_pages;
805
+ size_t dirty;
806
+
807
+ nr_pages = cpu_buffer->nr_pages;
808
+ if (!nr_pages || !full)
809
+ return true;
810
+
811
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
812
+
813
+ return (dirty * 100) > (full * nr_pages);
814
+}
533815
534816 /*
535817 * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
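The watermark check in full_hit() is plain integer arithmetic, with @full given as a percentage: (dirty * 100) > (full * nr_pages) avoids a division. With nr_pages = 16 and full = 50, for example, the condition first holds at 9 dirty pages (800 is not greater than 800, but 900 is). A hedged stand-alone restatement of the same check (helper name invented):

static bool example_watermark_reached(size_t dirty, size_t nr_pages, int full)
{
        if (!nr_pages || !full)
                return true;    /* no pages or no threshold: always a hit */

        /* e.g. dirty = 9, nr_pages = 16, full = 50  ->  900 > 800 -> true */
        return (dirty * 100) > ((size_t)full * nr_pages);
}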
....@@ -542,27 +824,60 @@
542824 struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
543825
544826 wake_up_all(&rbwork->waiters);
545
- if (rbwork->wakeup_full) {
827
+ if (rbwork->full_waiters_pending || rbwork->wakeup_full) {
546828 rbwork->wakeup_full = false;
829
+ rbwork->full_waiters_pending = false;
547830 wake_up_all(&rbwork->full_waiters);
548831 }
832
+}
833
+
834
+/**
835
+ * ring_buffer_wake_waiters - wake up any waiters on this ring buffer
836
+ * @buffer: The ring buffer to wake waiters on
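+ * @cpu: The CPU buffer to wake waiters on, or RING_BUFFER_ALL_CPUS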
837
+ *
838
+ * When a file that represents a ring buffer is closing,
839
+ * it is prudent to wake up any waiters that are on it.
840
+ */
841
+void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
842
+{
843
+ struct ring_buffer_per_cpu *cpu_buffer;
844
+ struct rb_irq_work *rbwork;
845
+
846
+ if (cpu == RING_BUFFER_ALL_CPUS) {
847
+
848
+ /* Wake up individual ones too. One level recursion */
849
+ for_each_buffer_cpu(buffer, cpu)
850
+ ring_buffer_wake_waiters(buffer, cpu);
851
+
852
+ rbwork = &buffer->irq_work;
853
+ } else {
854
+ cpu_buffer = buffer->buffers[cpu];
855
+ rbwork = &cpu_buffer->irq_work;
856
+ }
857
+
858
+ rbwork->wait_index++;
859
+ /* make sure the waiters see the new index */
860
+ smp_wmb();
861
+
862
+ rb_wake_up_waiters(&rbwork->work);
549863 }
550864
551865 /**
552866 * ring_buffer_wait - wait for input to the ring buffer
553867 * @buffer: buffer to wait on
554868 * @cpu: the cpu buffer to wait on
555
- * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
869
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
556870 *
557871 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
558872 * as data is added to any of the @buffer's cpu buffers. Otherwise
559873 * it will wait for data to be added to a specific cpu buffer.
560874 */
561
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
875
+int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
562876 {
563
- struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
877
+ struct ring_buffer_per_cpu *cpu_buffer;
564878 DEFINE_WAIT(wait);
565879 struct rb_irq_work *work;
880
+ long wait_index;
566881 int ret = 0;
567882
568883 /*
....@@ -573,7 +888,7 @@
573888 if (cpu == RING_BUFFER_ALL_CPUS) {
574889 work = &buffer->irq_work;
575890 /* Full only makes sense on per cpu reads */
576
- full = false;
891
+ full = 0;
577892 } else {
578893 if (!cpumask_test_cpu(cpu, buffer->cpumask))
579894 return -ENODEV;
....@@ -581,6 +896,7 @@
581896 work = &cpu_buffer->irq_work;
582897 }
583898
899
+ wait_index = READ_ONCE(work->wait_index);
584900
585901 while (true) {
586902 if (full)
....@@ -625,19 +941,29 @@
625941 !ring_buffer_empty_cpu(buffer, cpu)) {
626942 unsigned long flags;
627943 bool pagebusy;
944
+ bool done;
628945
629946 if (!full)
630947 break;
631948
632949 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
633950 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
634
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
951
+ done = !pagebusy && full_hit(buffer, cpu, full);
635952
636
- if (!pagebusy)
953
+ if (!cpu_buffer->shortest_full ||
954
+ cpu_buffer->shortest_full > full)
955
+ cpu_buffer->shortest_full = full;
956
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
957
+ if (done)
637958 break;
638959 }
639960
640961 schedule();
962
+
963
+ /* Make sure to see the new wait index */
964
+ smp_rmb();
965
+ if (wait_index != work->wait_index)
966
+ break;
641967 }
642968
643969 if (full)
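Since @full is now a percentage rather than a bool, a reader that only wants to run once half of a CPU's pages hold data would call ring_buffer_wait() roughly as in this hedged sketch (error handling trimmed, helper name invented):

static int example_wait_half_full(struct trace_buffer *buffer, int cpu)
{
        /* Sleeps until ~50% of this CPU's buffer pages are dirty */
        int ret = ring_buffer_wait(buffer, cpu, 50);

        if (ret < 0)
                return ret;     /* e.g. -ENODEV, or interrupted by a signal */

        /* consume events for @cpu here */
        return 0;
}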
....@@ -654,6 +980,7 @@
654980 * @cpu: the cpu buffer to wait on
655981 * @filp: the file descriptor
656982 * @poll_table: The poll descriptor
983
+ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
657984 *
658985 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
659986 * as data is added to any of the @buffer's cpu buffers. Otherwise
....@@ -662,15 +989,16 @@
662989 * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers,
663990 * zero otherwise.
664991 */
665
-__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
666
- struct file *filp, poll_table *poll_table)
992
+__poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
993
+ struct file *filp, poll_table *poll_table, int full)
667994 {
668995 struct ring_buffer_per_cpu *cpu_buffer;
669996 struct rb_irq_work *work;
670997
671
- if (cpu == RING_BUFFER_ALL_CPUS)
998
+ if (cpu == RING_BUFFER_ALL_CPUS) {
672999 work = &buffer->irq_work;
673
- else {
1000
+ full = 0;
1001
+ } else {
6741002 if (!cpumask_test_cpu(cpu, buffer->cpumask))
6751003 return -EINVAL;
6761004
....@@ -678,8 +1006,17 @@
6781006 work = &cpu_buffer->irq_work;
6791007 }
6801008
681
- poll_wait(filp, &work->waiters, poll_table);
682
- work->waiters_pending = true;
1009
+ if (full) {
1010
+ poll_wait(filp, &work->full_waiters, poll_table);
1011
+ work->full_waiters_pending = true;
1012
+ if (!cpu_buffer->shortest_full ||
1013
+ cpu_buffer->shortest_full > full)
1014
+ cpu_buffer->shortest_full = full;
1015
+ } else {
1016
+ poll_wait(filp, &work->waiters, poll_table);
1017
+ work->waiters_pending = true;
1018
+ }
1019
+
6831020 /*
6841021 * There's a tight race between setting the waiters_pending and
6851022 * checking if the ring buffer is empty. Once the waiters_pending bit
....@@ -694,6 +1031,9 @@
6941031 * will fix it later.
6951032 */
6961033 smp_mb();
1034
+
1035
+ if (full)
1036
+ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
6971037
6981038 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
6991039 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
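Callers pass the same percentage through their poll handlers. A hedged sketch of a file_operations ->poll callback wired to one CPU buffer (the private struct and its fields are invented for illustration):

struct example_poll_private {
        struct trace_buffer     *buffer;
        int                     cpu;
        int                     full;   /* percentage; 0 means "any data" */
};

static __poll_t example_rb_poll(struct file *filp, poll_table *wait)
{
        struct example_poll_private *p = filp->private_data;

        /* Returns EPOLLIN | EPOLLRDNORM once enough data is buffered */
        return ring_buffer_poll_wait(p->buffer, p->cpu, filp, wait, p->full);
}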
....@@ -720,13 +1060,21 @@
7201060 /* Up this if you want to test the TIME_EXTENTS and normalization */
7211061 #define DEBUG_SHIFT 0
7221062
723
-static inline u64 rb_time_stamp(struct ring_buffer *buffer)
1063
+static inline u64 rb_time_stamp(struct trace_buffer *buffer)
7241064 {
1065
+ u64 ts;
1066
+
1067
+ /* Skip retpolines :-( */
1068
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
1069
+ ts = trace_clock_local();
1070
+ else
1071
+ ts = buffer->clock();
1072
+
7251073 /* shift to debug/test normalization and TIME_EXTENTS */
726
- return buffer->clock() << DEBUG_SHIFT;
1074
+ return ts << DEBUG_SHIFT;
7271075 }
7281076
729
-u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
1077
+u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
7301078 {
7311079 u64 time;
7321080
....@@ -738,7 +1086,7 @@
7381086 }
7391087 EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
7401088
741
-void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
1089
+void ring_buffer_normalize_time_stamp(struct trace_buffer *buffer,
7421090 int cpu, u64 *ts)
7431091 {
7441092 /* Just stupid testing the normalize function and deltas */
....@@ -1056,6 +1404,7 @@
10561404 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
10571405 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
10581406
1407
+ local_inc(&cpu_buffer->pages_touched);
10591408 /*
10601409 * Just make sure we have seen our old_write and synchronize
10611410 * with any interrupts that come in.
....@@ -1109,19 +1458,6 @@
11091458 }
11101459
11111460 /**
1112
- * rb_check_list - make sure a pointer to a list has the last bits zero
1113
- */
1114
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
1115
- struct list_head *list)
1116
-{
1117
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
1118
- return 1;
1119
- if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
1120
- return 1;
1121
- return 0;
1122
-}
1123
-
1124
-/**
11251461 * rb_check_pages - integrity check of buffer pages
11261462 * @cpu_buffer: CPU buffer with pages to test
11271463 *
....@@ -1130,35 +1466,26 @@
11301466 */
11311467 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
11321468 {
1133
- struct list_head *head = cpu_buffer->pages;
1134
- struct buffer_page *bpage, *tmp;
1469
+ struct list_head *head = rb_list_head(cpu_buffer->pages);
1470
+ struct list_head *tmp;
11351471
1136
- /* Reset the head page if it exists */
1137
- if (cpu_buffer->head_page)
1138
- rb_set_head_page(cpu_buffer);
1139
-
1140
- rb_head_page_deactivate(cpu_buffer);
1141
-
1142
- if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
1143
- return -1;
1144
- if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
1472
+ if (RB_WARN_ON(cpu_buffer,
1473
+ rb_list_head(rb_list_head(head->next)->prev) != head))
11451474 return -1;
11461475
1147
- if (rb_check_list(cpu_buffer, head))
1476
+ if (RB_WARN_ON(cpu_buffer,
1477
+ rb_list_head(rb_list_head(head->prev)->next) != head))
11481478 return -1;
11491479
1150
- list_for_each_entry_safe(bpage, tmp, head, list) {
1480
+ for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
11511481 if (RB_WARN_ON(cpu_buffer,
1152
- bpage->list.next->prev != &bpage->list))
1482
+ rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
11531483 return -1;
1484
+
11541485 if (RB_WARN_ON(cpu_buffer,
1155
- bpage->list.prev->next != &bpage->list))
1156
- return -1;
1157
- if (rb_check_list(cpu_buffer, &bpage->list))
1486
+ rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
11581487 return -1;
11591488 }
1160
-
1161
- rb_head_page_activate(cpu_buffer);
11621489
11631490 return 0;
11641491 }
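The rewritten check walks the raw list pointers and passes every pointer through rb_list_head(), because the ring buffer stores its HEAD/UPDATE flags in the low bits of the ->next pointers. Conceptually the helper (defined earlier in this file) just masks those bits off, roughly as in this hedged sketch:

static struct list_head *example_rb_list_head(struct list_head *list)
{
        unsigned long val = (unsigned long)list;

        /* Clear the low flag bits before the pointer is dereferenced */
        return (struct list_head *)(val & ~3UL);
}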
....@@ -1260,7 +1587,7 @@
12601587 }
12611588
12621589 static struct ring_buffer_per_cpu *
1263
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
1590
+rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
12641591 {
12651592 struct ring_buffer_per_cpu *cpu_buffer;
12661593 struct buffer_page *bpage;
....@@ -1325,11 +1652,13 @@
13251652 struct list_head *head = cpu_buffer->pages;
13261653 struct buffer_page *bpage, *tmp;
13271654
1655
+ irq_work_sync(&cpu_buffer->irq_work.work);
1656
+
13281657 free_buffer_page(cpu_buffer->reader_page);
13291658
1330
- rb_head_page_deactivate(cpu_buffer);
1331
-
13321659 if (head) {
1660
+ rb_head_page_deactivate(cpu_buffer);
1661
+
13331662 list_for_each_entry_safe(bpage, tmp, head, list) {
13341663 list_del_init(&bpage->list);
13351664 free_buffer_page(bpage);
....@@ -1345,16 +1674,17 @@
13451674 * __ring_buffer_alloc - allocate a new ring_buffer
13461675 * @size: the size in bytes per cpu that is needed.
13471676 * @flags: attributes to set for the ring buffer.
1677
+ * @key: ring buffer reader_lock_key.
13481678 *
13491679 * Currently the only flag that is available is the RB_FL_OVERWRITE
13501680 * flag. This flag means that the buffer will overwrite old data
13511681 * when the buffer wraps. If this flag is not set, the buffer will
13521682 * drop data when the tail hits the head.
13531683 */
1354
-struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
1684
+struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
13551685 struct lock_class_key *key)
13561686 {
1357
- struct ring_buffer *buffer;
1687
+ struct trace_buffer *buffer;
13581688 long nr_pages;
13591689 int bsize;
13601690 int cpu;
....@@ -1424,11 +1754,13 @@
14241754 * @buffer: the buffer to free.
14251755 */
14261756 void
1427
-ring_buffer_free(struct ring_buffer *buffer)
1757
+ring_buffer_free(struct trace_buffer *buffer)
14281758 {
14291759 int cpu;
14301760
14311761 cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
1762
+
1763
+ irq_work_sync(&buffer->irq_work.work);
14321764
14331765 for_each_buffer_cpu(buffer, cpu)
14341766 rb_free_cpu_buffer(buffer->buffers[cpu]);
....@@ -1440,18 +1772,18 @@
14401772 }
14411773 EXPORT_SYMBOL_GPL(ring_buffer_free);
14421774
1443
-void ring_buffer_set_clock(struct ring_buffer *buffer,
1775
+void ring_buffer_set_clock(struct trace_buffer *buffer,
14441776 u64 (*clock)(void))
14451777 {
14461778 buffer->clock = clock;
14471779 }
14481780
1449
-void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
1781
+void ring_buffer_set_time_stamp_abs(struct trace_buffer *buffer, bool abs)
14501782 {
14511783 buffer->time_stamp_abs = abs;
14521784 }
14531785
1454
-bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
1786
+bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer)
14551787 {
14561788 return buffer->time_stamp_abs;
14571789 }
....@@ -1509,6 +1841,8 @@
15091841 to_remove = rb_list_head(to_remove)->next;
15101842 head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
15111843 }
1844
+ /* Read iterators need to reset themselves when some pages removed */
1845
+ cpu_buffer->pages_removed += nr_removed;
15121846
15131847 next_page = rb_list_head(to_remove)->next;
15141848
....@@ -1529,12 +1863,6 @@
15291863 if (head_bit)
15301864 cpu_buffer->head_page = list_entry(next_page,
15311865 struct buffer_page, list);
1532
-
1533
- /*
1534
- * change read pointer to make sure any read iterators reset
1535
- * themselves
1536
- */
1537
- cpu_buffer->read = 0;
15381866
15391867 /* pages are removed, resume tracing and then free the pages */
15401868 atomic_dec(&cpu_buffer->record_disabled);
....@@ -1563,7 +1891,8 @@
15631891 * Increment overrun to account for the lost events.
15641892 */
15651893 local_add(page_entries, &cpu_buffer->overrun);
1566
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1894
+ local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
1895
+ local_inc(&cpu_buffer->pages_lost);
15671896 }
15681897
15691898 /*
....@@ -1689,7 +2018,7 @@
16892018 *
16902019 * Returns 0 on success and < 0 on failure.
16912020 */
1692
-int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
2021
+int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
16932022 int cpu_id)
16942023 {
16952024 struct ring_buffer_per_cpu *cpu_buffer;
....@@ -1715,18 +2044,24 @@
17152044
17162045 size = nr_pages * BUF_PAGE_SIZE;
17172046
1718
- /*
1719
- * Don't succeed if resizing is disabled, as a reader might be
1720
- * manipulating the ring buffer and is expecting a sane state while
1721
- * this is true.
1722
- */
1723
- if (atomic_read(&buffer->resize_disabled))
1724
- return -EBUSY;
1725
-
17262047 /* prevent another thread from changing buffer sizes */
17272048 mutex_lock(&buffer->mutex);
2049
+ atomic_inc(&buffer->resizing);
17282050
17292051 if (cpu_id == RING_BUFFER_ALL_CPUS) {
2052
+ /*
2053
+ * Don't succeed if resizing is disabled, as a reader might be
2054
+ * manipulating the ring buffer and is expecting a sane state while
2055
+ * this is true.
2056
+ */
2057
+ for_each_buffer_cpu(buffer, cpu) {
2058
+ cpu_buffer = buffer->buffers[cpu];
2059
+ if (atomic_read(&cpu_buffer->resize_disabled)) {
2060
+ err = -EBUSY;
2061
+ goto out_err_unlock;
2062
+ }
2063
+ }
2064
+
17302065 /* calculate the pages to update */
17312066 for_each_buffer_cpu(buffer, cpu) {
17322067 cpu_buffer = buffer->buffers[cpu];
....@@ -1749,6 +2084,8 @@
17492084 err = -ENOMEM;
17502085 goto out_err;
17512086 }
2087
+
2088
+ cond_resched();
17522089 }
17532090
17542091 get_online_cpus();
....@@ -1794,6 +2131,16 @@
17942131 if (nr_pages == cpu_buffer->nr_pages)
17952132 goto out;
17962133
2134
+ /*
2135
+ * Don't succeed if resizing is disabled, as a reader might be
2136
+ * manipulating the ring buffer and is expecting a sane state while
2137
+ * this is true.
2138
+ */
2139
+ if (atomic_read(&cpu_buffer->resize_disabled)) {
2140
+ err = -EBUSY;
2141
+ goto out_err_unlock;
2142
+ }
2143
+
17972144 cpu_buffer->nr_pages_to_update = nr_pages -
17982145 cpu_buffer->nr_pages;
17992146
....@@ -1836,7 +2183,7 @@
18362183 * There could have been a race between checking
18372184 * record_disable and incrementing it.
18382185 */
1839
- synchronize_sched();
2186
+ synchronize_rcu();
18402187 for_each_buffer_cpu(buffer, cpu) {
18412188 cpu_buffer = buffer->buffers[cpu];
18422189 rb_check_pages(cpu_buffer);
....@@ -1844,6 +2191,7 @@
18442191 atomic_dec(&buffer->record_disabled);
18452192 }
18462193
2194
+ atomic_dec(&buffer->resizing);
18472195 mutex_unlock(&buffer->mutex);
18482196 return 0;
18492197
....@@ -1863,12 +2211,14 @@
18632211 free_buffer_page(bpage);
18642212 }
18652213 }
2214
+ out_err_unlock:
2215
+ atomic_dec(&buffer->resizing);
18662216 mutex_unlock(&buffer->mutex);
18672217 return err;
18682218 }
18692219 EXPORT_SYMBOL_GPL(ring_buffer_resize);
18702220
1871
-void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
2221
+void ring_buffer_change_overwrite(struct trace_buffer *buffer, int val)
18722222 {
18732223 mutex_lock(&buffer->mutex);
18742224 if (val)
....@@ -1891,15 +2241,63 @@
18912241 cpu_buffer->reader_page->read);
18922242 }
18932243
1894
-static __always_inline struct ring_buffer_event *
2244
+static struct ring_buffer_event *
18952245 rb_iter_head_event(struct ring_buffer_iter *iter)
18962246 {
1897
- return __rb_page_index(iter->head_page, iter->head);
1898
-}
2247
+ struct ring_buffer_event *event;
2248
+ struct buffer_page *iter_head_page = iter->head_page;
2249
+ unsigned long commit;
2250
+ unsigned length;
18992251
1900
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
1901
-{
1902
- return local_read(&bpage->page->commit);
2252
+ if (iter->head != iter->next_event)
2253
+ return iter->event;
2254
+
2255
+ /*
2256
+ * When the writer goes across pages, it issues a cmpxchg which
2257
+ * is a mb(), which will synchronize with the rmb here.
2258
+ * (see rb_tail_page_update() and __rb_reserve_next())
2259
+ */
2260
+ commit = rb_page_commit(iter_head_page);
2261
+ smp_rmb();
2262
+
2263
+ /* An event needs to be at least 8 bytes in size */
2264
+ if (iter->head > commit - 8)
2265
+ goto reset;
2266
+
2267
+ event = __rb_page_index(iter_head_page, iter->head);
2268
+ length = rb_event_length(event);
2269
+
2270
+ /*
2271
+ * READ_ONCE() doesn't work on functions and we don't want the
2272
+ * compiler doing any crazy optimizations with length.
2273
+ */
2274
+ barrier();
2275
+
2276
+ if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
2277
+ /* Writer corrupted the read? */
2278
+ goto reset;
2279
+
2280
+ memcpy(iter->event, event, length);
2281
+ /*
2282
+ * If the page stamp is still the same after this rmb() then the
2283
+ * event was safely copied without the writer entering the page.
2284
+ */
2285
+ smp_rmb();
2286
+
2287
+ /* Make sure the page didn't change since we read this */
2288
+ if (iter->page_stamp != iter_head_page->page->time_stamp ||
2289
+ commit > rb_page_commit(iter_head_page))
2290
+ goto reset;
2291
+
2292
+ iter->next_event = iter->head + length;
2293
+ return iter->event;
2294
+ reset:
2295
+ /* Reset to the beginning */
2296
+ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
2297
+ iter->head = 0;
2298
+ iter->next_event = 0;
2299
+ iter->missed_events = 1;
2300
+ return NULL;
19032301 }
19042302
19052303 /* Size is determined by what has been committed */
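rb_iter_head_event() now follows a copy-then-verify pattern: snapshot the commit, copy the event into iter->event, then re-check the page stamp and commit after an smp_rmb(); if the writer touched the page in between, the copy is discarded and the iterator resets. Stripped to its skeleton, the pattern looks roughly like this (hedged sketch, details elided):

static bool example_copy_then_verify(struct ring_buffer_iter *iter,
                                     struct ring_buffer_event *event,
                                     unsigned int length)
{
        unsigned long commit = rb_page_commit(iter->head_page);

        /* Pairs with the writer's cmpxchg in rb_tail_page_update() */
        smp_rmb();

        memcpy(iter->event, event, length);

        /* If the writer entered the page, the stamp or commit moved */
        smp_rmb();
        return iter->page_stamp == iter->head_page->page->time_stamp &&
               commit <= rb_page_commit(iter->head_page);
}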
....@@ -1937,8 +2335,9 @@
19372335 else
19382336 rb_inc_page(cpu_buffer, &iter->head_page);
19392337
1940
- iter->read_stamp = iter->head_page->page->time_stamp;
2338
+ iter->page_stamp = iter->read_stamp = iter->head_page->page->time_stamp;
19412339 iter->head = 0;
2340
+ iter->next_event = 0;
19422341 }
19432342
19442343 /*
....@@ -1987,7 +2386,8 @@
19872386 * the counters.
19882387 */
19892388 local_add(entries, &cpu_buffer->overrun);
1990
- local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
2389
+ local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
2390
+ local_inc(&cpu_buffer->pages_lost);
19912391
19922392 /*
19932393 * The entries will be zeroed out when we move the
....@@ -2129,9 +2529,6 @@
21292529
21302530 event = __rb_page_index(tail_page, tail);
21312531
2132
- /* account for padding bytes */
2133
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
2134
-
21352532 /*
21362533 * Save the original length to the meta data.
21372534 * This will be used by the reader to add lost event
....@@ -2145,7 +2542,8 @@
21452542 * write counter enough to allow another writer to slip
21462543 * in on this page.
21472544 * We put in a discarded commit instead, to make sure
2148
- * that this space is not used again.
2545
+ * that this space is not used again, and this space will
2546
+ * not be accounted into 'entries_bytes'.
21492547 *
21502548 * If we are less than the minimum size, we don't need to
21512549 * worry about it.
....@@ -2155,6 +2553,9 @@
21552553
21562554 /* Mark the rest of the page with padding */
21572555 rb_event_set_padding(event);
2556
+
2557
+ /* Make sure the padding is visible before the write update */
2558
+ smp_wmb();
21582559
21592560 /* Set the write back to the previous setting */
21602561 local_sub(length, &tail_page->write);
....@@ -2166,6 +2567,12 @@
21662567 event->type_len = RINGBUF_TYPE_PADDING;
21672568 /* time delta must be non zero */
21682569 event->time_delta = 1;
2570
+
2571
+ /* account for padding bytes */
2572
+ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
2573
+
2574
+ /* Make sure the padding is visible before the tail_page->write update */
2575
+ smp_wmb();
21692576
21702577 /* Set write to end of buffer */
21712578 length = (tail + length) - BUF_PAGE_SIZE;
....@@ -2183,7 +2590,7 @@
21832590 {
21842591 struct buffer_page *tail_page = info->tail_page;
21852592 struct buffer_page *commit_page = cpu_buffer->commit_page;
2186
- struct ring_buffer *buffer = cpu_buffer->buffer;
2593
+ struct trace_buffer *buffer = cpu_buffer->buffer;
21872594 struct buffer_page *next_page;
21882595 int ret;
21892596
....@@ -2280,8 +2687,8 @@
22802687 return NULL;
22812688 }
22822689
2283
-/* Slow path, do not inline */
2284
-static noinline struct ring_buffer_event *
2690
+/* Slow path */
2691
+static struct ring_buffer_event *
22852692 rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
22862693 {
22872694 if (abs)
....@@ -2305,13 +2712,73 @@
23052712 static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
23062713 struct ring_buffer_event *event);
23072714
2715
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2716
+static inline bool sched_clock_stable(void)
2717
+{
2718
+ return true;
2719
+}
2720
+#endif
2721
+
2722
+static void
2723
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2724
+ struct rb_event_info *info)
2725
+{
2726
+ u64 write_stamp;
2727
+
2728
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
2729
+ (unsigned long long)info->delta,
2730
+ (unsigned long long)info->ts,
2731
+ (unsigned long long)info->before,
2732
+ (unsigned long long)info->after,
2733
+ (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
2734
+ sched_clock_stable() ? "" :
2735
+ "If you just came from a suspend/resume,\n"
2736
+ "please switch to the trace global clock:\n"
2737
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
2738
+ "or add trace_clock=global to the kernel command line\n");
2739
+}
2740
+
2741
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2742
+ struct ring_buffer_event **event,
2743
+ struct rb_event_info *info,
2744
+ u64 *delta,
2745
+ unsigned int *length)
2746
+{
2747
+ bool abs = info->add_timestamp &
2748
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
2749
+
2750
+ if (unlikely(info->delta > (1ULL << 59))) {
2751
+ /* did the clock go backwards */
2752
+ if (info->before == info->after && info->before > info->ts) {
2753
+ /* not interrupted */
2754
+ static int once;
2755
+
2756
+ /*
2757
+ * This is possible with a recalibrating of the TSC.
2758
+ * Do not produce a call stack, but just report it.
2759
+ */
2760
+ if (!once) {
2761
+ once++;
2762
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
2763
+ info->before, info->ts);
2764
+ }
2765
+ } else
2766
+ rb_check_timestamp(cpu_buffer, info);
2767
+ if (!abs)
2768
+ info->delta = 0;
2769
+ }
2770
+ *event = rb_add_time_stamp(*event, info->delta, abs);
2771
+ *length -= RB_LEN_TIME_EXTEND;
2772
+ *delta = 0;
2773
+}
2774
+
23082775 /**
23092776 * rb_update_event - update event type and data
2777
+ * @cpu_buffer: The per cpu buffer of the @event
23102778 * @event: the event to update
2311
- * @type: the type of event
2312
- * @length: the size of the event field in the ring buffer
2779
+ * @info: The info to update the @event with (contains length and delta)
23132780 *
2314
- * Update the type and data fields of the event. The length
2781
+ * Update the type and data fields of the @event. The length
23152782 * is the actual size that is written to the ring buffer,
23162783 * and with this, we can determine what to place into the
23172784 * data field.
....@@ -2324,21 +2791,12 @@
23242791 unsigned length = info->length;
23252792 u64 delta = info->delta;
23262793
2327
- /* Only a commit updates the timestamp */
2328
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
2329
- delta = 0;
2330
-
23312794 /*
23322795 * If we need to add a timestamp, then we
23332796 * add it to the start of the reserved space.
23342797 */
2335
- if (unlikely(info->add_timestamp)) {
2336
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
2337
-
2338
- event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
2339
- length -= RB_LEN_TIME_EXTEND;
2340
- delta = 0;
2341
- }
2798
+ if (unlikely(info->add_timestamp))
2799
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
23422800
23432801 event->time_delta = delta;
23442802 length -= RB_EVNT_HDR_SIZE;
....@@ -2381,12 +2839,38 @@
23812839 return length;
23822840 }
23832841
2384
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2385
-static inline bool sched_clock_stable(void)
2842
+static __always_inline bool
2843
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2844
+ struct ring_buffer_event *event)
23862845 {
2387
- return true;
2846
+ unsigned long addr = (unsigned long)event;
2847
+ unsigned long index;
2848
+
2849
+ index = rb_event_index(event);
2850
+ addr &= PAGE_MASK;
2851
+
2852
+ return cpu_buffer->commit_page->page == (void *)addr &&
2853
+ rb_commit_index(cpu_buffer) == index;
23882854 }
2389
-#endif
2855
+
2856
+static u64 rb_time_delta(struct ring_buffer_event *event)
2857
+{
2858
+ switch (event->type_len) {
2859
+ case RINGBUF_TYPE_PADDING:
2860
+ return 0;
2861
+
2862
+ case RINGBUF_TYPE_TIME_EXTEND:
2863
+ return ring_buffer_event_time_stamp(event);
2864
+
2865
+ case RINGBUF_TYPE_TIME_STAMP:
2866
+ return 0;
2867
+
2868
+ case RINGBUF_TYPE_DATA:
2869
+ return event->time_delta;
2870
+ default:
2871
+ return 0;
2872
+ }
2873
+}
23902874
23912875 static inline int
23922876 rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
....@@ -2396,6 +2880,8 @@
23962880 struct buffer_page *bpage;
23972881 unsigned long index;
23982882 unsigned long addr;
2883
+ u64 write_stamp;
2884
+ u64 delta;
23992885
24002886 new_index = rb_event_index(event);
24012887 old_index = new_index + rb_event_ts_length(event);
....@@ -2404,10 +2890,43 @@
24042890
24052891 bpage = READ_ONCE(cpu_buffer->tail_page);
24062892
2893
+ delta = rb_time_delta(event);
2894
+
2895
+ if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
2896
+ return 0;
2897
+
2898
+ /* Make sure the write stamp is read before testing the location */
2899
+ barrier();
2900
+
24072901 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
24082902 unsigned long write_mask =
24092903 local_read(&bpage->write) & ~RB_WRITE_MASK;
24102904 unsigned long event_length = rb_event_length(event);
2905
+
2906
+ /* Something came in, can't discard */
2907
+ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
2908
+ write_stamp, write_stamp - delta))
2909
+ return 0;
2910
+
2911
+ /*
2912
+ * It's possible that the event time delta is zero
2913
+ * (has the same time stamp as the previous event)
2914
+ * in which case write_stamp and before_stamp could
2915
+ * be the same. In such a case, force before_stamp
2916
+ * to be different than write_stamp. It doesn't
2917
+ * matter what it is, as long as its different.
2918
+ */
2919
+ if (!delta)
2920
+ rb_time_set(&cpu_buffer->before_stamp, 0);
2921
+
2922
+ /*
2923
+ * If an event were to come in now, it would see that the
2924
+ * write_stamp and the before_stamp are different, and assume
2925
+ * that this event just added itself before updating
2926
+ * the write stamp. The interrupting event will fix the
2927
+ * write stamp for us, and use the before stamp as its delta.
2928
+ */
2929
+
24112930 /*
24122931 * This is on the tail page. It is possible that
24132932 * a write could come in and move the tail page
....@@ -2456,19 +2975,21 @@
24562975 if (RB_WARN_ON(cpu_buffer,
24572976 rb_is_reader_page(cpu_buffer->tail_page)))
24582977 return;
2978
+ /*
2979
+ * No need for a memory barrier here, as the update
2980
+ * of the tail_page did it for this page.
2981
+ */
24592982 local_set(&cpu_buffer->commit_page->page->commit,
24602983 rb_page_write(cpu_buffer->commit_page));
24612984 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
2462
- /* Only update the write stamp if the page has an event */
2463
- if (rb_page_write(cpu_buffer->commit_page))
2464
- cpu_buffer->write_stamp =
2465
- cpu_buffer->commit_page->page->time_stamp;
24662985 /* add barrier to keep gcc from optimizing too much */
24672986 barrier();
24682987 }
24692988 while (rb_commit_index(cpu_buffer) !=
24702989 rb_page_write(cpu_buffer->commit_page)) {
24712990
2991
+ /* Make sure the readers see the content of what is committed. */
2992
+ smp_wmb();
24722993 local_set(&cpu_buffer->commit_page->page->commit,
24732994 rb_page_write(cpu_buffer->commit_page));
24742995 RB_WARN_ON(cpu_buffer,
....@@ -2534,62 +3055,16 @@
25343055 event->time_delta = 1;
25353056 }
25363057
2537
-static __always_inline bool
2538
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2539
- struct ring_buffer_event *event)
2540
-{
2541
- unsigned long addr = (unsigned long)event;
2542
- unsigned long index;
2543
-
2544
- index = rb_event_index(event);
2545
- addr &= PAGE_MASK;
2546
-
2547
- return cpu_buffer->commit_page->page == (void *)addr &&
2548
- rb_commit_index(cpu_buffer) == index;
2549
-}
2550
-
2551
-static __always_inline void
2552
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2553
- struct ring_buffer_event *event)
2554
-{
2555
- u64 delta;
2556
-
2557
- /*
2558
- * The event first in the commit queue updates the
2559
- * time stamp.
2560
- */
2561
- if (rb_event_is_commit(cpu_buffer, event)) {
2562
- /*
2563
- * A commit event that is first on a page
2564
- * updates the write timestamp with the page stamp
2565
- */
2566
- if (!rb_event_index(event))
2567
- cpu_buffer->write_stamp =
2568
- cpu_buffer->commit_page->page->time_stamp;
2569
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2570
- delta = ring_buffer_event_time_stamp(event);
2571
- cpu_buffer->write_stamp += delta;
2572
- } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
2573
- delta = ring_buffer_event_time_stamp(event);
2574
- cpu_buffer->write_stamp = delta;
2575
- } else
2576
- cpu_buffer->write_stamp += event->time_delta;
2577
- }
2578
-}
2579
-
25803058 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
25813059 struct ring_buffer_event *event)
25823060 {
25833061 local_inc(&cpu_buffer->entries);
2584
- rb_update_write_stamp(cpu_buffer, event);
25853062 rb_end_commit(cpu_buffer);
25863063 }
25873064
25883065 static __always_inline void
2589
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
3066
+rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
25903067 {
2591
- bool pagebusy;
2592
-
25933068 if (buffer->irq_work.waiters_pending) {
25943069 buffer->irq_work.waiters_pending = false;
25953070 /* irq_work_queue() supplies it's own memory barriers */
....@@ -2602,14 +3077,24 @@
26023077 irq_work_queue(&cpu_buffer->irq_work.work);
26033078 }
26043079
2605
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
3080
+ if (cpu_buffer->last_pages_touch == local_read(&cpu_buffer->pages_touched))
3081
+ return;
26063082
2607
- if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2608
- cpu_buffer->irq_work.wakeup_full = true;
2609
- cpu_buffer->irq_work.full_waiters_pending = false;
2610
- /* irq_work_queue() supplies it's own memory barriers */
2611
- irq_work_queue(&cpu_buffer->irq_work.work);
2612
- }
3083
+ if (cpu_buffer->reader_page == cpu_buffer->commit_page)
3084
+ return;
3085
+
3086
+ if (!cpu_buffer->irq_work.full_waiters_pending)
3087
+ return;
3088
+
3089
+ cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
3090
+
3091
+ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
3092
+ return;
3093
+
3094
+ cpu_buffer->irq_work.wakeup_full = true;
3095
+ cpu_buffer->irq_work.full_waiters_pending = false;
3096
+ /* irq_work_queue() supplies its own memory barriers */
3097
+ irq_work_queue(&cpu_buffer->irq_work.work);
26133098 }
26143099
26153100 /*
....@@ -2727,7 +3212,7 @@
27273212 * Call this function before calling another ring_buffer_lock_reserve() and
27283213 * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
27293214 */
2730
-void ring_buffer_nest_start(struct ring_buffer *buffer)
3215
+void ring_buffer_nest_start(struct trace_buffer *buffer)
27313216 {
27323217 struct ring_buffer_per_cpu *cpu_buffer;
27333218 int cpu;
....@@ -2747,7 +3232,7 @@
27473232 * Must be called after ring_buffer_nest_start() and after the
27483233 * ring_buffer_unlock_commit().
27493234 */
2750
-void ring_buffer_nest_end(struct ring_buffer *buffer)
3235
+void ring_buffer_nest_end(struct trace_buffer *buffer)
27513236 {
27523237 struct ring_buffer_per_cpu *cpu_buffer;
27533238 int cpu;
....@@ -2769,7 +3254,7 @@
27693254 *
27703255 * Must be paired with ring_buffer_lock_reserve.
27713256 */
2772
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
3257
+int ring_buffer_unlock_commit(struct trace_buffer *buffer,
27733258 struct ring_buffer_event *event)
27743259 {
27753260 struct ring_buffer_per_cpu *cpu_buffer;
....@@ -2789,57 +3274,135 @@
27893274 }
27903275 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
27913276
2792
-static noinline void
2793
-rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2794
- struct rb_event_info *info)
2795
-{
2796
- WARN_ONCE(info->delta > (1ULL << 59),
2797
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2798
- (unsigned long long)info->delta,
2799
- (unsigned long long)info->ts,
2800
- (unsigned long long)cpu_buffer->write_stamp,
2801
- sched_clock_stable() ? "" :
2802
- "If you just came from a suspend/resume,\n"
2803
- "please switch to the trace global clock:\n"
2804
- " echo global > /sys/kernel/debug/tracing/trace_clock\n"
2805
- "or add trace_clock=global to the kernel command line\n");
2806
- info->add_timestamp = 1;
2807
-}
2808
-
28093277 static struct ring_buffer_event *
28103278 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
28113279 struct rb_event_info *info)
28123280 {
28133281 struct ring_buffer_event *event;
28143282 struct buffer_page *tail_page;
2815
- unsigned long tail, write;
2816
-
2817
- /*
2818
- * If the time delta since the last event is too big to
2819
- * hold in the time field of the event, then we append a
2820
- * TIME EXTEND event ahead of the data event.
2821
- */
2822
- if (unlikely(info->add_timestamp))
2823
- info->length += RB_LEN_TIME_EXTEND;
3283
+ unsigned long tail, write, w;
3284
+ bool a_ok;
3285
+ bool b_ok;
28243286
28253287 /* Don't let the compiler play games with cpu_buffer->tail_page */
28263288 tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
2827
- write = local_add_return(info->length, &tail_page->write);
3289
+
3290
+ /*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
3291
+ barrier();
3292
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
3293
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
3294
+ barrier();
3295
+ info->ts = rb_time_stamp(cpu_buffer->buffer);
3296
+
3297
+ if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
3298
+ info->delta = info->ts;
3299
+ } else {
3300
+ /*
3301
+ * If interrupting an event time update, we may need an
3302
+ * absolute timestamp.
3303
+ * Don't bother if this is the start of a new page (w == 0).
3304
+ */
3305
+ if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
3306
+ info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
3307
+ info->length += RB_LEN_TIME_EXTEND;
3308
+ } else {
3309
+ info->delta = info->ts - info->after;
3310
+ if (unlikely(test_time_stamp(info->delta))) {
3311
+ info->add_timestamp |= RB_ADD_STAMP_EXTEND;
3312
+ info->length += RB_LEN_TIME_EXTEND;
3313
+ }
3314
+ }
3315
+ }
3316
+
3317
+ /*B*/ rb_time_set(&cpu_buffer->before_stamp, info->ts);
3318
+
3319
+ /*C*/ write = local_add_return(info->length, &tail_page->write);
28283320
28293321 /* set write to only the index of the write */
28303322 write &= RB_WRITE_MASK;
3323
+
28313324 tail = write - info->length;
3325
+
3326
+ /* See if we shot pass the end of this buffer page */
3327
+ if (unlikely(write > BUF_PAGE_SIZE)) {
3328
+ /* before and after may now different, fix it up*/
3329
+ b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
3330
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
3331
+ if (a_ok && b_ok && info->before != info->after)
3332
+ (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
3333
+ info->before, info->after);
3334
+ return rb_move_tail(cpu_buffer, tail, info);
3335
+ }
3336
+
3337
+ if (likely(tail == w)) {
3338
+ u64 save_before;
3339
+ bool s_ok;
3340
+
3341
+ /* Nothing interrupted us between A and C */
3342
+ /*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
3343
+ barrier();
3344
+ /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
3345
+ RB_WARN_ON(cpu_buffer, !s_ok);
3346
+ if (likely(!(info->add_timestamp &
3347
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
3348
+ /* This did not interrupt any time update */
3349
+ info->delta = info->ts - info->after;
3350
+ else
3351
+ /* Just use full timestamp for interrupting event */
3352
+ info->delta = info->ts;
3353
+ barrier();
3354
+ if (unlikely(info->ts != save_before)) {
3355
+ /* SLOW PATH - Interrupted between C and E */
3356
+
3357
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
3358
+ RB_WARN_ON(cpu_buffer, !a_ok);
3359
+
3360
+ /* Write stamp must only go forward */
3361
+ if (save_before > info->after) {
3362
+ /*
3363
+ * We do not care about the result, only that
3364
+ * it gets updated atomically.
3365
+ */
3366
+ (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
3367
+ info->after, save_before);
3368
+ }
3369
+ }
3370
+ } else {
3371
+ u64 ts;
3372
+ /* SLOW PATH - Interrupted between A and C */
3373
+ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
3374
+ /* Was interrupted before here, write_stamp must be valid */
3375
+ RB_WARN_ON(cpu_buffer, !a_ok);
3376
+ ts = rb_time_stamp(cpu_buffer->buffer);
3377
+ barrier();
3378
+ /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
3379
+ info->after < ts &&
3380
+ rb_time_cmpxchg(&cpu_buffer->write_stamp,
3381
+ info->after, ts)) {
3382
+ /* Nothing came after this event between C and E */
3383
+ info->delta = ts - info->after;
3384
+ info->ts = ts;
3385
+ } else {
3386
+ /*
3387
+ * Interrupted beween C and E:
3388
+ * Lost the previous events time stamp. Just set the
3389
+ * delta to zero, and this will be the same time as
3390
+ * the event this event interrupted. And the events that
3391
+ * came after this will still be correct (as they would
3392
+ * have built their delta on the previous event.
3393
+ */
3394
+ info->delta = 0;
3395
+ }
3396
+ info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
3397
+ }
28323398
28333399 /*
28343400 * If this is the first commit on the page, then it has the same
28353401 * timestamp as the page itself.
28363402 */
2837
- if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
3403
+ if (unlikely(!tail && !(info->add_timestamp &
3404
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
28383405 info->delta = 0;
2839
-
2840
- /* See if we shot pass the end of this buffer page */
2841
- if (unlikely(write > BUF_PAGE_SIZE))
2842
- return rb_move_tail(cpu_buffer, tail, info);
28433406
28443407 /* We reserved something on the buffer */
28453408
....@@ -2852,7 +3415,7 @@
28523415 * If this is the first commit on the page, then update
28533416 * its timestamp.
28543417 */
2855
- if (!tail)
3418
+ if (unlikely(!tail))
28563419 tail_page->page->time_stamp = info->ts;
28573420
28583421 /* account for these added bytes */
....@@ -2862,16 +3425,17 @@
28623425 }
28633426
28643427 static __always_inline struct ring_buffer_event *
2865
-rb_reserve_next_event(struct ring_buffer *buffer,
3428
+rb_reserve_next_event(struct trace_buffer *buffer,
28663429 struct ring_buffer_per_cpu *cpu_buffer,
28673430 unsigned long length)
28683431 {
28693432 struct ring_buffer_event *event;
28703433 struct rb_event_info info;
28713434 int nr_loops = 0;
2872
- u64 diff;
3435
+ int add_ts_default;
28733436
28743437 rb_start_commit(cpu_buffer);
3438
+ /* The commit page can not change after this */
28753439
28763440 #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
28773441 /*
....@@ -2889,8 +3453,16 @@
28893453 #endif
28903454
28913455 info.length = rb_calculate_event_length(length);
3456
+
3457
+ if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
3458
+ add_ts_default = RB_ADD_STAMP_ABSOLUTE;
3459
+ info.length += RB_LEN_TIME_EXTEND;
3460
+ } else {
3461
+ add_ts_default = RB_ADD_STAMP_NONE;
3462
+ }
3463
+
28923464 again:
2893
- info.add_timestamp = 0;
3465
+ info.add_timestamp = add_ts_default;
28943466 info.delta = 0;
28953467
28963468 /*
....@@ -2905,35 +3477,16 @@
29053477 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
29063478 goto out_fail;
29073479
2908
- info.ts = rb_time_stamp(cpu_buffer->buffer);
2909
- diff = info.ts - cpu_buffer->write_stamp;
2910
-
2911
- /* make sure this diff is calculated here */
2912
- barrier();
2913
-
2914
- if (ring_buffer_time_stamp_abs(buffer)) {
2915
- info.delta = info.ts;
2916
- rb_handle_timestamp(cpu_buffer, &info);
2917
- } else /* Did the write stamp get updated already? */
2918
- if (likely(info.ts >= cpu_buffer->write_stamp)) {
2919
- info.delta = diff;
2920
- if (unlikely(test_time_stamp(info.delta)))
2921
- rb_handle_timestamp(cpu_buffer, &info);
2922
- }
2923
-
29243480 event = __rb_reserve_next(cpu_buffer, &info);
29253481
29263482 if (unlikely(PTR_ERR(event) == -EAGAIN)) {
2927
- if (info.add_timestamp)
3483
+ if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
29283484 info.length -= RB_LEN_TIME_EXTEND;
29293485 goto again;
29303486 }
29313487
2932
- if (!event)
2933
- goto out_fail;
2934
-
2935
- return event;
2936
-
3488
+ if (likely(event))
3489
+ return event;
29373490 out_fail:
29383491 rb_end_commit(cpu_buffer);
29393492 return NULL;
....@@ -2955,7 +3508,7 @@
29553508 * If NULL is returned, then nothing has been allocated or locked.
29563509 */
29573510 struct ring_buffer_event *
2958
-ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
3511
+ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
29593512 {
29603513 struct ring_buffer_per_cpu *cpu_buffer;
29613514 struct ring_buffer_event *event;
....@@ -3056,7 +3609,7 @@
30563609 * If this function is called, do not call ring_buffer_unlock_commit on
30573610 * the event.
30583611 */
3059
-void ring_buffer_discard_commit(struct ring_buffer *buffer,
3612
+void ring_buffer_discard_commit(struct trace_buffer *buffer,
30603613 struct ring_buffer_event *event)
30613614 {
30623615 struct ring_buffer_per_cpu *cpu_buffer;
....@@ -3079,11 +3632,6 @@
30793632 if (rb_try_to_discard(cpu_buffer, event))
30803633 goto out;
30813634
3082
- /*
3083
- * The commit is still visible by the reader, so we
3084
- * must still update the timestamp.
3085
- */
3086
- rb_update_write_stamp(cpu_buffer, event);
30873635 out:
30883636 rb_end_commit(cpu_buffer);
30893637
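A hypothetical sketch of the reserve-then-discard path this function serves; keep_this_event() stands in for whatever filtering the caller applies:

/* Hypothetical writer that backs out of a reservation it no longer wants. */
static void example_filtered_write(struct trace_buffer *buffer,
				   const void *data, unsigned long len)
{
	struct ring_buffer_event *event;

	event = ring_buffer_lock_reserve(buffer, len);
	if (!event)
		return;

	memcpy(ring_buffer_event_data(event), data, len);

	if (!keep_this_event(event))	/* hypothetical filter */
		ring_buffer_discard_commit(buffer, event);
	else
		ring_buffer_unlock_commit(buffer, event);
}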
....@@ -3107,7 +3655,7 @@
31073655 * Note, like ring_buffer_lock_reserve, the length is the length of the data
31083656 * and not the length of the event which would hold the header.
31093657 */
3110
-int ring_buffer_write(struct ring_buffer *buffer,
3658
+int ring_buffer_write(struct trace_buffer *buffer,
31113659 unsigned long length,
31123660 void *data)
31133661 {
....@@ -3205,9 +3753,9 @@
32053753 * This prevents all writes to the buffer. Any attempt to write
32063754 * to the buffer after this will fail and return NULL.
32073755 *
3208
- * The caller should call synchronize_sched() after this.
3756
+ * The caller should call synchronize_rcu() after this.
32093757 */
3210
-void ring_buffer_record_disable(struct ring_buffer *buffer)
3758
+void ring_buffer_record_disable(struct trace_buffer *buffer)
32113759 {
32123760 atomic_inc(&buffer->record_disabled);
32133761 }
....@@ -3220,7 +3768,7 @@
32203768 * Note, multiple disables will need the same number of enables
32213769 * to truly enable the writing (much like preempt_disable).
32223770 */
3223
-void ring_buffer_record_enable(struct ring_buffer *buffer)
3771
+void ring_buffer_record_enable(struct trace_buffer *buffer)
32243772 {
32253773 atomic_dec(&buffer->record_disabled);
32263774 }
....@@ -3237,7 +3785,7 @@
32373785 * it works like an on/off switch, whereas the disable() version
32383786 * must be paired with an enable().
32393787 */
3240
-void ring_buffer_record_off(struct ring_buffer *buffer)
3788
+void ring_buffer_record_off(struct trace_buffer *buffer)
32413789 {
32423790 unsigned int rd;
32433791 unsigned int new_rd;
....@@ -3260,7 +3808,7 @@
32603808 * it works like an on/off switch, whereas the enable() version
32613809 * must be paired with a disable().
32623810 */
3263
-void ring_buffer_record_on(struct ring_buffer *buffer)
3811
+void ring_buffer_record_on(struct trace_buffer *buffer)
32643812 {
32653813 unsigned int rd;
32663814 unsigned int new_rd;
....@@ -3278,7 +3826,7 @@
32783826 *
32793827 * Returns true if the ring buffer is in a state that it accepts writes.
32803828 */
3281
-bool ring_buffer_record_is_on(struct ring_buffer *buffer)
3829
+bool ring_buffer_record_is_on(struct trace_buffer *buffer)
32823830 {
32833831 return !atomic_read(&buffer->record_disabled);
32843832 }
....@@ -3294,7 +3842,7 @@
32943842 * ring_buffer_record_disable(), as that is a temporary disabling of
32953843 * the ring buffer.
32963844 */
3297
-bool ring_buffer_record_is_set_on(struct ring_buffer *buffer)
3845
+bool ring_buffer_record_is_set_on(struct trace_buffer *buffer)
32983846 {
32993847 return !(atomic_read(&buffer->record_disabled) & RB_BUFFER_OFF);
33003848 }
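The kerneldoc above distinguishes the nested disable/enable pair from the sticky off/on switch; a hypothetical sketch of both (assumes the usual kernel headers):

/* Hypothetical: pause recording temporarily, then switch it off for good. */
static void example_pause_then_switch_off(struct trace_buffer *buffer)
{
	/* Nested: every disable must be balanced by an enable. */
	ring_buffer_record_disable(buffer);
	synchronize_rcu();		/* let in-flight writers finish */
	/* ... safely inspect the buffer ... */
	ring_buffer_record_enable(buffer);

	/* Sticky: one off() keeps writes rejected until a matching on(). */
	ring_buffer_record_off(buffer);
	if (!ring_buffer_record_is_set_on(buffer))
		pr_debug("ring buffer switched off\n");
}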
....@@ -3307,9 +3855,9 @@
33073855 * This prevents all writes to the buffer. Any attempt to write
33083856 * to the buffer after this will fail and return NULL.
33093857 *
3310
- * The caller should call synchronize_sched() after this.
3858
+ * The caller should call synchronize_rcu() after this.
33113859 */
3312
-void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
3860
+void ring_buffer_record_disable_cpu(struct trace_buffer *buffer, int cpu)
33133861 {
33143862 struct ring_buffer_per_cpu *cpu_buffer;
33153863
....@@ -3329,7 +3877,7 @@
33293877 * Note, multiple disables will need the same number of enables
33303878 * to truly enable the writing (much like preempt_disable).
33313879 */
3332
-void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
3880
+void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
33333881 {
33343882 struct ring_buffer_per_cpu *cpu_buffer;
33353883
....@@ -3359,7 +3907,7 @@
33593907 * @buffer: The ring buffer
33603908 * @cpu: The per CPU buffer to read from.
33613909 */
3362
-u64 ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
3910
+u64 ring_buffer_oldest_event_ts(struct trace_buffer *buffer, int cpu)
33633911 {
33643912 unsigned long flags;
33653913 struct ring_buffer_per_cpu *cpu_buffer;
....@@ -3388,11 +3936,11 @@
33883936 EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
33893937
33903938 /**
3391
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
3939
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
33923940 * @buffer: The ring buffer
33933941 * @cpu: The per CPU buffer to read from.
33943942 */
3395
-unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
3943
+unsigned long ring_buffer_bytes_cpu(struct trace_buffer *buffer, int cpu)
33963944 {
33973945 struct ring_buffer_per_cpu *cpu_buffer;
33983946 unsigned long ret;
....@@ -3412,7 +3960,7 @@
34123960 * @buffer: The ring buffer
34133961 * @cpu: The per CPU buffer to get the entries from.
34143962 */
3415
-unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
3963
+unsigned long ring_buffer_entries_cpu(struct trace_buffer *buffer, int cpu)
34163964 {
34173965 struct ring_buffer_per_cpu *cpu_buffer;
34183966
....@@ -3431,7 +3979,7 @@
34313979 * @buffer: The ring buffer
34323980 * @cpu: The per CPU buffer to get the number of overruns from
34333981 */
3434
-unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
3982
+unsigned long ring_buffer_overrun_cpu(struct trace_buffer *buffer, int cpu)
34353983 {
34363984 struct ring_buffer_per_cpu *cpu_buffer;
34373985 unsigned long ret;
....@@ -3454,7 +4002,7 @@
34544002 * @cpu: The per CPU buffer to get the number of overruns from
34554003 */
34564004 unsigned long
3457
-ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
4005
+ring_buffer_commit_overrun_cpu(struct trace_buffer *buffer, int cpu)
34584006 {
34594007 struct ring_buffer_per_cpu *cpu_buffer;
34604008 unsigned long ret;
....@@ -3476,7 +4024,7 @@
34764024 * @cpu: The per CPU buffer to get the number of overruns from
34774025 */
34784026 unsigned long
3479
-ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
4027
+ring_buffer_dropped_events_cpu(struct trace_buffer *buffer, int cpu)
34804028 {
34814029 struct ring_buffer_per_cpu *cpu_buffer;
34824030 unsigned long ret;
....@@ -3497,7 +4045,7 @@
34974045 * @cpu: The per CPU buffer to get the number of events read
34984046 */
34994047 unsigned long
3500
-ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
4048
+ring_buffer_read_events_cpu(struct trace_buffer *buffer, int cpu)
35014049 {
35024050 struct ring_buffer_per_cpu *cpu_buffer;
35034051
....@@ -3516,7 +4064,7 @@
35164064 * Returns the total number of entries in the ring buffer
35174065 * (all CPU entries)
35184066 */
3519
-unsigned long ring_buffer_entries(struct ring_buffer *buffer)
4067
+unsigned long ring_buffer_entries(struct trace_buffer *buffer)
35204068 {
35214069 struct ring_buffer_per_cpu *cpu_buffer;
35224070 unsigned long entries = 0;
....@@ -3539,7 +4087,7 @@
35394087 * Returns the total number of overruns in the ring buffer
35404088 * (all CPU entries)
35414089 */
3542
-unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
4090
+unsigned long ring_buffer_overruns(struct trace_buffer *buffer)
35434091 {
35444092 struct ring_buffer_per_cpu *cpu_buffer;
35454093 unsigned long overruns = 0;
....@@ -3562,14 +4110,19 @@
35624110 /* Iterator usage is expected to have record disabled */
35634111 iter->head_page = cpu_buffer->reader_page;
35644112 iter->head = cpu_buffer->reader_page->read;
4113
+ iter->next_event = iter->head;
35654114
35664115 iter->cache_reader_page = iter->head_page;
35674116 iter->cache_read = cpu_buffer->read;
4117
+ iter->cache_pages_removed = cpu_buffer->pages_removed;
35684118
3569
- if (iter->head)
4119
+ if (iter->head) {
35704120 iter->read_stamp = cpu_buffer->read_stamp;
3571
- else
4121
+ iter->page_stamp = cpu_buffer->reader_page->page->time_stamp;
4122
+ } else {
35724123 iter->read_stamp = iter->head_page->page->time_stamp;
4124
+ iter->page_stamp = iter->read_stamp;
4125
+ }
35734126 }
35744127
35754128 /**
....@@ -3605,17 +4158,38 @@
36054158 struct buffer_page *reader;
36064159 struct buffer_page *head_page;
36074160 struct buffer_page *commit_page;
4161
+ struct buffer_page *curr_commit_page;
36084162 unsigned commit;
4163
+ u64 curr_commit_ts;
4164
+ u64 commit_ts;
36094165
36104166 cpu_buffer = iter->cpu_buffer;
3611
-
3612
- /* Remember, trace recording is off when iterator is in use */
36134167 reader = cpu_buffer->reader_page;
36144168 head_page = cpu_buffer->head_page;
36154169 commit_page = cpu_buffer->commit_page;
3616
- commit = rb_page_commit(commit_page);
4170
+ commit_ts = commit_page->page->time_stamp;
36174171
3618
- return ((iter->head_page == commit_page && iter->head == commit) ||
4172
+ /*
4173
+ * When the writer goes across pages, it issues a cmpxchg which
4174
+ * is a mb(), which will synchronize with the rmb here.
4175
+ * (see rb_tail_page_update())
4176
+ */
4177
+ smp_rmb();
4178
+ commit = rb_page_commit(commit_page);
4179
+ /* We want to make sure that the commit page doesn't change */
4180
+ smp_rmb();
4181
+
4182
+ /* Make sure commit page didn't change */
4183
+ curr_commit_page = READ_ONCE(cpu_buffer->commit_page);
4184
+ curr_commit_ts = READ_ONCE(curr_commit_page->page->time_stamp);
4185
+
4186
+ /* If the commit page changed, then there's more data */
4187
+ if (curr_commit_page != commit_page ||
4188
+ curr_commit_ts != commit_ts)
4189
+ return 0;
4190
+
4191
+ /* Still racy, as it may return a false positive, but that's OK */
4192
+ return ((iter->head_page == commit_page && iter->head >= commit) ||
36194193 (iter->head_page == reader && commit_page == head_page &&
36204194 head_page->read == commit &&
36214195 iter->head == rb_page_commit(cpu_buffer->reader_page)));
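The added check samples the commit page and its timestamp, orders the reads with smp_rmb(), then samples again; if either moved, a writer is active and the buffer cannot be declared empty. The final comparison as a hypothetical helper:

/* Hypothetical helper: did the commit page stay put while we looked? */
static bool rb_commit_page_stable(struct ring_buffer_per_cpu *cpu_buffer,
				  struct buffer_page *commit_page, u64 commit_ts)
{
	struct buffer_page *curr = READ_ONCE(cpu_buffer->commit_page);

	return curr == commit_page &&
	       READ_ONCE(curr->page->time_stamp) == commit_ts;
}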
....@@ -3647,7 +4221,7 @@
36474221 return;
36484222
36494223 default:
3650
- BUG();
4224
+ RB_WARN_ON(cpu_buffer, 1);
36514225 }
36524226 return;
36534227 }
....@@ -3677,7 +4251,7 @@
36774251 return;
36784252
36794253 default:
3680
- BUG();
4254
+ RB_WARN_ON(iter->cpu_buffer, 1);
36814255 }
36824256 return;
36834257 }
....@@ -3786,12 +4360,14 @@
37864360 goto spin;
37874361
37884362 /*
3789
- * Yeah! We succeeded in replacing the page.
4363
+ * Yay! We succeeded in replacing the page.
37904364 *
37914365 * Now make the new head point back to the reader page.
37924366 */
37934367 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
37944368 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
4369
+
4370
+ local_inc(&cpu_buffer->pages_read);
37954371
37964372 /* Finally update the reader page to the new head */
37974373 cpu_buffer->reader_page = reader;
....@@ -3811,6 +4387,38 @@
38114387
38124388 arch_spin_unlock(&cpu_buffer->lock);
38134389 local_irq_restore(flags);
4390
+
4391
+ /*
4392
+ * The writer has preempt disable, wait for it. But not forever
4393
+ * Although, 1 second is pretty much "forever"
4394
+ */
4395
+#define USECS_WAIT 1000000
4396
+ for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
4397
+ /* If the write is past the end of page, a writer is still updating it */
4398
+ if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
4399
+ break;
4400
+
4401
+ udelay(1);
4402
+
4403
+ /* Get the latest version of the reader write value */
4404
+ smp_rmb();
4405
+ }
4406
+
4407
+ /* The writer is not moving forward? Something is wrong */
4408
+ if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT))
4409
+ reader = NULL;
4410
+
4411
+ /*
4412
+ * Make sure we see any padding after the write update
4413
+ * (see rb_reset_tail()).
4414
+ *
4415
+ * In addition, a writer may be writing on the reader page
4416
+ * if the page has not been fully filled, so the read barrier
4417
+ * is also needed to make sure we see the content of what is
4418
+ * committed by the writer (see rb_set_commit_to_write()).
4419
+ */
4420
+ smp_rmb();
4421
+
38144422
38154423 return reader;
38164424 }
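The loop added above bounds the wait for a writer that has temporarily pushed the write index past the end of the reader page. As a stand-alone hypothetical helper (USECS_WAIT and rb_page_write() as used in this file):

/* Hypothetical helper: wait up to ~1s for the writer to leave the page. */
static bool rb_wait_for_stable_reader(struct buffer_page *reader)
{
	int nr_loops;

	for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
		if (rb_page_write(reader) <= BUF_PAGE_SIZE)
			return true;
		udelay(1);
		smp_rmb();	/* pick up the writer's latest write index */
	}
	return false;		/* writer appears stuck; caller gives up */
}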
....@@ -3836,20 +4444,28 @@
38364444
38374445 length = rb_event_length(event);
38384446 cpu_buffer->reader_page->read += length;
4447
+ cpu_buffer->read_bytes += length;
38394448 }
38404449
38414450 static void rb_advance_iter(struct ring_buffer_iter *iter)
38424451 {
38434452 struct ring_buffer_per_cpu *cpu_buffer;
3844
- struct ring_buffer_event *event;
3845
- unsigned length;
38464453
38474454 cpu_buffer = iter->cpu_buffer;
4455
+
4456
+ /* If head == next_event then we need to jump to the next event */
4457
+ if (iter->head == iter->next_event) {
4458
+ /* If the event gets overwritten again, there's nothing to do */
4459
+ if (rb_iter_head_event(iter) == NULL)
4460
+ return;
4461
+ }
4462
+
4463
+ iter->head = iter->next_event;
38484464
38494465 /*
38504466 * Check if we are at the end of the buffer.
38514467 */
3852
- if (iter->head >= rb_page_size(iter->head_page)) {
4468
+ if (iter->next_event >= rb_page_size(iter->head_page)) {
38534469 /* discarded commits can make the page empty */
38544470 if (iter->head_page == cpu_buffer->commit_page)
38554471 return;
....@@ -3857,27 +4473,7 @@
38574473 return;
38584474 }
38594475
3860
- event = rb_iter_head_event(iter);
3861
-
3862
- length = rb_event_length(event);
3863
-
3864
- /*
3865
- * This should not be called to advance the header if we are
3866
- * at the tail of the buffer.
3867
- */
3868
- if (RB_WARN_ON(cpu_buffer,
3869
- (iter->head_page == cpu_buffer->commit_page) &&
3870
- (iter->head + length > rb_commit_index(cpu_buffer))))
3871
- return;
3872
-
3873
- rb_update_iter_read_stamp(iter, event);
3874
-
3875
- iter->head += length;
3876
-
3877
- /* check for end of page padding */
3878
- if ((iter->head >= rb_page_size(iter->head_page)) &&
3879
- (iter->head_page != cpu_buffer->commit_page))
3880
- rb_inc_iter(iter);
4476
+ rb_update_iter_read_stamp(iter, iter->event);
38814477 }
38824478
38834479 static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
....@@ -3951,7 +4547,7 @@
39514547 return event;
39524548
39534549 default:
3954
- BUG();
4550
+ RB_WARN_ON(cpu_buffer, 1);
39554551 }
39564552
39574553 return NULL;
....@@ -3961,7 +4557,7 @@
39614557 static struct ring_buffer_event *
39624558 rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
39634559 {
3964
- struct ring_buffer *buffer;
4560
+ struct trace_buffer *buffer;
39654561 struct ring_buffer_per_cpu *cpu_buffer;
39664562 struct ring_buffer_event *event;
39674563 int nr_loops = 0;
....@@ -3973,12 +4569,13 @@
39734569 buffer = cpu_buffer->buffer;
39744570
39754571 /*
3976
- * Check if someone performed a consuming read to
3977
- * the buffer. A consuming read invalidates the iterator
3978
- * and we need to reset the iterator in this case.
4572
+ * Check if someone performed a consuming read to the buffer
4573
+ * or removed some pages from the buffer. In these cases,
4574
+ * iterator was invalidated and we need to reset it.
39794575 */
39804576 if (unlikely(iter->cache_read != cpu_buffer->read ||
3981
- iter->cache_reader_page != cpu_buffer->reader_page))
4577
+ iter->cache_reader_page != cpu_buffer->reader_page ||
4578
+ iter->cache_pages_removed != cpu_buffer->pages_removed))
39824579 rb_iter_reset(iter);
39834580
39844581 again:
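The extended staleness test can be read as a single predicate; a hypothetical helper using the same fields:

/* Hypothetical: has a consuming read or page removal invalidated this iterator? */
static bool rb_iter_is_stale(struct ring_buffer_iter *iter)
{
	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;

	return iter->cache_read != cpu_buffer->read ||
	       iter->cache_reader_page != cpu_buffer->reader_page ||
	       iter->cache_pages_removed != cpu_buffer->pages_removed;
}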
....@@ -3986,14 +4583,13 @@
39864583 return NULL;
39874584
39884585 /*
3989
- * We repeat when a time extend is encountered or we hit
3990
- * the end of the page. Since the time extend is always attached
3991
- * to a data event, we should never loop more than three times.
3992
- * Once for going to next page, once on time extend, and
3993
- * finally once to get the event.
3994
- * (We never hit the following condition more than thrice).
4586
+ * As the writer can mess with what the iterator is trying
4587
+ * to read, just give up if we fail to get an event after
4588
+ * three tries. The iterator is not as reliable when reading
4589
+ * the ring buffer with an active write as the consumer is.
4590
+ * Do not warn if the three failures is reached.
39954591 */
3996
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
4592
+ if (++nr_loops > 3)
39974593 return NULL;
39984594
39994595 if (rb_per_cpu_empty(cpu_buffer))
....@@ -4005,6 +4601,8 @@
40054601 }
40064602
40074603 event = rb_iter_head_event(iter);
4604
+ if (!event)
4605
+ goto again;
40084606
40094607 switch (event->type_len) {
40104608 case RINGBUF_TYPE_PADDING:
....@@ -4039,7 +4637,7 @@
40394637 return event;
40404638
40414639 default:
4042
- BUG();
4640
+ RB_WARN_ON(cpu_buffer, 1);
40434641 }
40444642
40454643 return NULL;
....@@ -4089,7 +4687,7 @@
40894687 * not consume the data.
40904688 */
40914689 struct ring_buffer_event *
4092
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
4690
+ring_buffer_peek(struct trace_buffer *buffer, int cpu, u64 *ts,
40934691 unsigned long *lost_events)
40944692 {
40954693 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
....@@ -4114,6 +4712,20 @@
41144712
41154713 return event;
41164714 }
4715
+
4716
+/** ring_buffer_iter_dropped - report if there are dropped events
4717
+ * @iter: The ring buffer iterator
4718
+ *
4719
+ * Returns true if there were dropped events since the last peek.
4720
+ */
4721
+bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter)
4722
+{
4723
+ bool ret = iter->missed_events != 0;
4724
+
4725
+ iter->missed_events = 0;
4726
+ return ret;
4727
+}
4728
+EXPORT_SYMBOL_GPL(ring_buffer_iter_dropped);
41174729
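A hypothetical caller of the new helper, reporting losses once per iteration pass:

/* Hypothetical: note that the writer overtook this iterator since the last peek. */
static void example_report_dropped(struct ring_buffer_iter *iter)
{
	if (ring_buffer_iter_dropped(iter))
		pr_info("ring buffer: events were dropped since the last peek\n");
}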
41184730 /**
41194731 * ring_buffer_iter_peek - peek at the next event to be read
....@@ -4153,7 +4765,7 @@
41534765 * and eventually empty the ring buffer if the producer is slower.
41544766 */
41554767 struct ring_buffer_event *
4156
-ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
4768
+ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
41574769 unsigned long *lost_events)
41584770 {
41594771 struct ring_buffer_per_cpu *cpu_buffer;
....@@ -4213,7 +4825,7 @@
42134825 * This overall must be paired with ring_buffer_read_finish.
42144826 */
42154827 struct ring_buffer_iter *
4216
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
4828
+ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
42174829 {
42184830 struct ring_buffer_per_cpu *cpu_buffer;
42194831 struct ring_buffer_iter *iter;
....@@ -4221,16 +4833,21 @@
42214833 if (!cpumask_test_cpu(cpu, buffer->cpumask))
42224834 return NULL;
42234835
4224
- iter = kmalloc(sizeof(*iter), flags);
4836
+ iter = kzalloc(sizeof(*iter), flags);
42254837 if (!iter)
42264838 return NULL;
4839
+
4840
+ iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
4841
+ if (!iter->event) {
4842
+ kfree(iter);
4843
+ return NULL;
4844
+ }
42274845
42284846 cpu_buffer = buffer->buffers[cpu];
42294847
42304848 iter->cpu_buffer = cpu_buffer;
42314849
4232
- atomic_inc(&buffer->resize_disabled);
4233
- atomic_inc(&cpu_buffer->record_disabled);
4850
+ atomic_inc(&cpu_buffer->resize_disabled);
42344851
42354852 return iter;
42364853 }
....@@ -4246,7 +4863,7 @@
42464863 void
42474864 ring_buffer_read_prepare_sync(void)
42484865 {
4249
- synchronize_sched();
4866
+ synchronize_rcu();
42504867 }
42514868 EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
42524869
....@@ -4303,48 +4920,38 @@
43034920 rb_check_pages(cpu_buffer);
43044921 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
43054922
4306
- atomic_dec(&cpu_buffer->record_disabled);
4307
- atomic_dec(&cpu_buffer->buffer->resize_disabled);
4923
+ atomic_dec(&cpu_buffer->resize_disabled);
4924
+ kfree(iter->event);
43084925 kfree(iter);
43094926 }
43104927 EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
43114928
43124929 /**
4313
- * ring_buffer_read - read the next item in the ring buffer by the iterator
4930
+ * ring_buffer_iter_advance - advance the iterator to the next location
43144931 * @iter: The ring buffer iterator
4315
- * @ts: The time stamp of the event read.
43164932 *
4317
- * This reads the next event in the ring buffer and increments the iterator.
4933
+ * Move the location of the iterator such that the next read will
4934
+ * be the next location of the iterator.
43184935 */
4319
-struct ring_buffer_event *
4320
-ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
4936
+void ring_buffer_iter_advance(struct ring_buffer_iter *iter)
43214937 {
4322
- struct ring_buffer_event *event;
43234938 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
43244939 unsigned long flags;
43254940
43264941 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
4327
- again:
4328
- event = rb_iter_peek(iter, ts);
4329
- if (!event)
4330
- goto out;
4331
-
4332
- if (event->type_len == RINGBUF_TYPE_PADDING)
4333
- goto again;
43344942
43354943 rb_advance_iter(iter);
4336
- out:
4337
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
43384944
4339
- return event;
4945
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
43404946 }
4341
-EXPORT_SYMBOL_GPL(ring_buffer_read);
4947
+EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
43424948
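With ring_buffer_read() removed, a reader now drives the iterator with peek plus advance. A hypothetical end-to-end sketch (assumes <linux/ring_buffer.h>):

/* Hypothetical non-consuming dump of one CPU's buffer. */
static void example_iterate_cpu(struct trace_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;
	struct ring_buffer_event *event;
	u64 ts;

	iter = ring_buffer_read_prepare(buffer, cpu, GFP_KERNEL);
	if (!iter)
		return;
	ring_buffer_read_prepare_sync();
	ring_buffer_read_start(iter);

	while ((event = ring_buffer_iter_peek(iter, &ts)) != NULL) {
		/* ... decode ring_buffer_event_data(event) at time ts ... */
		ring_buffer_iter_advance(iter);
	}

	ring_buffer_read_finish(iter);
}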
43434949 /**
43444950 * ring_buffer_size - return the size of the ring buffer (in bytes)
43454951 * @buffer: The ring buffer.
4952
+ * @cpu: The CPU to get ring buffer size from.
43464953 */
4347
-unsigned long ring_buffer_size(struct ring_buffer *buffer, int cpu)
4954
+unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
43484955 {
43494956 /*
43504957 * Earlier, this method returned
....@@ -4359,28 +4966,34 @@
43594966 }
43604967 EXPORT_SYMBOL_GPL(ring_buffer_size);
43614968
4969
+static void rb_clear_buffer_page(struct buffer_page *page)
4970
+{
4971
+ local_set(&page->write, 0);
4972
+ local_set(&page->entries, 0);
4973
+ rb_init_page(page->page);
4974
+ page->read = 0;
4975
+}
4976
+
43624977 static void
43634978 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
43644979 {
4980
+ struct buffer_page *page;
4981
+
43654982 rb_head_page_deactivate(cpu_buffer);
43664983
43674984 cpu_buffer->head_page
43684985 = list_entry(cpu_buffer->pages, struct buffer_page, list);
4369
- local_set(&cpu_buffer->head_page->write, 0);
4370
- local_set(&cpu_buffer->head_page->entries, 0);
4371
- local_set(&cpu_buffer->head_page->page->commit, 0);
4372
-
4373
- cpu_buffer->head_page->read = 0;
4986
+ rb_clear_buffer_page(cpu_buffer->head_page);
4987
+ list_for_each_entry(page, cpu_buffer->pages, list) {
4988
+ rb_clear_buffer_page(page);
4989
+ }
43744990
43754991 cpu_buffer->tail_page = cpu_buffer->head_page;
43764992 cpu_buffer->commit_page = cpu_buffer->head_page;
43774993
43784994 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
43794995 INIT_LIST_HEAD(&cpu_buffer->new_pages);
4380
- local_set(&cpu_buffer->reader_page->write, 0);
4381
- local_set(&cpu_buffer->reader_page->entries, 0);
4382
- local_set(&cpu_buffer->reader_page->page->commit, 0);
4383
- cpu_buffer->reader_page->read = 0;
4996
+ rb_clear_buffer_page(cpu_buffer->reader_page);
43844997
43854998 local_set(&cpu_buffer->entries_bytes, 0);
43864999 local_set(&cpu_buffer->overrun, 0);
....@@ -4389,38 +5002,28 @@
43895002 local_set(&cpu_buffer->entries, 0);
43905003 local_set(&cpu_buffer->committing, 0);
43915004 local_set(&cpu_buffer->commits, 0);
5005
+ local_set(&cpu_buffer->pages_touched, 0);
5006
+ local_set(&cpu_buffer->pages_lost, 0);
5007
+ local_set(&cpu_buffer->pages_read, 0);
5008
+ cpu_buffer->last_pages_touch = 0;
5009
+ cpu_buffer->shortest_full = 0;
43925010 cpu_buffer->read = 0;
43935011 cpu_buffer->read_bytes = 0;
43945012
4395
- cpu_buffer->write_stamp = 0;
4396
- cpu_buffer->read_stamp = 0;
5013
+ rb_time_set(&cpu_buffer->write_stamp, 0);
5014
+ rb_time_set(&cpu_buffer->before_stamp, 0);
43975015
43985016 cpu_buffer->lost_events = 0;
43995017 cpu_buffer->last_overrun = 0;
44005018
44015019 rb_head_page_activate(cpu_buffer);
5020
+ cpu_buffer->pages_removed = 0;
44025021 }
44035022
4404
-/**
4405
- * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
4406
- * @buffer: The ring buffer to reset a per cpu buffer of
4407
- * @cpu: The CPU buffer to be reset
4408
- */
4409
-void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
5023
+/* Must have disabled the cpu buffer then done a synchronize_rcu */
5024
+static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
44105025 {
4411
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
44125026 unsigned long flags;
4413
-
4414
- if (!cpumask_test_cpu(cpu, buffer->cpumask))
4415
- return;
4416
- /* prevent another thread from changing buffer sizes */
4417
- mutex_lock(&buffer->mutex);
4418
-
4419
- atomic_inc(&buffer->resize_disabled);
4420
- atomic_inc(&cpu_buffer->record_disabled);
4421
-
4422
- /* Make sure all commits have finished */
4423
- synchronize_sched();
44245027
44255028 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
44265029
....@@ -4435,24 +5038,115 @@
44355038
44365039 out:
44375040 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
5041
+}
5042
+
5043
+/**
5044
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
5045
+ * @buffer: The ring buffer to reset a per cpu buffer of
5046
+ * @cpu: The CPU buffer to be reset
5047
+ */
5048
+void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
5049
+{
5050
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
5051
+
5052
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
5053
+ return;
5054
+
5055
+ /* prevent another thread from changing buffer sizes */
5056
+ mutex_lock(&buffer->mutex);
5057
+
5058
+ atomic_inc(&cpu_buffer->resize_disabled);
5059
+ atomic_inc(&cpu_buffer->record_disabled);
5060
+
5061
+ /* Make sure all commits have finished */
5062
+ synchronize_rcu();
5063
+
5064
+ reset_disabled_cpu_buffer(cpu_buffer);
44385065
44395066 atomic_dec(&cpu_buffer->record_disabled);
4440
- atomic_dec(&buffer->resize_disabled);
5067
+ atomic_dec(&cpu_buffer->resize_disabled);
44415068
44425069 mutex_unlock(&buffer->mutex);
44435070 }
44445071 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
44455072
5073
+/* Flag to ensure proper resetting of atomic variables */
5074
+#define RESET_BIT (1 << 30)
5075
+
5076
+/**
5077
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
5078
+ * @buffer: The ring buffer to reset a per cpu buffer of
5079
+ * @cpu: The CPU buffer to be reset
5080
+ */
5081
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
5082
+{
5083
+ struct ring_buffer_per_cpu *cpu_buffer;
5084
+ int cpu;
5085
+
5086
+ /* prevent another thread from changing buffer sizes */
5087
+ mutex_lock(&buffer->mutex);
5088
+
5089
+ for_each_online_buffer_cpu(buffer, cpu) {
5090
+ cpu_buffer = buffer->buffers[cpu];
5091
+
5092
+ atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
5093
+ atomic_inc(&cpu_buffer->record_disabled);
5094
+ }
5095
+
5096
+ /* Make sure all commits have finished */
5097
+ synchronize_rcu();
5098
+
5099
+ for_each_buffer_cpu(buffer, cpu) {
5100
+ cpu_buffer = buffer->buffers[cpu];
5101
+
5102
+ /*
5103
+ * If a CPU came online during the synchronize_rcu(), then
5104
+ * ignore it.
5105
+ */
5106
+ if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
5107
+ continue;
5108
+
5109
+ reset_disabled_cpu_buffer(cpu_buffer);
5110
+
5111
+ atomic_dec(&cpu_buffer->record_disabled);
5112
+ atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
5113
+ }
5114
+
5115
+ mutex_unlock(&buffer->mutex);
5116
+}
5117
+
44465118 /**
44475119 * ring_buffer_reset - reset a ring buffer
44485120 * @buffer: The ring buffer to reset all cpu buffers
44495121 */
4450
-void ring_buffer_reset(struct ring_buffer *buffer)
5122
+void ring_buffer_reset(struct trace_buffer *buffer)
44515123 {
5124
+ struct ring_buffer_per_cpu *cpu_buffer;
44525125 int cpu;
44535126
4454
- for_each_buffer_cpu(buffer, cpu)
4455
- ring_buffer_reset_cpu(buffer, cpu);
5127
+ /* prevent another thread from changing buffer sizes */
5128
+ mutex_lock(&buffer->mutex);
5129
+
5130
+ for_each_buffer_cpu(buffer, cpu) {
5131
+ cpu_buffer = buffer->buffers[cpu];
5132
+
5133
+ atomic_inc(&cpu_buffer->resize_disabled);
5134
+ atomic_inc(&cpu_buffer->record_disabled);
5135
+ }
5136
+
5137
+ /* Make sure all commits have finished */
5138
+ synchronize_rcu();
5139
+
5140
+ for_each_buffer_cpu(buffer, cpu) {
5141
+ cpu_buffer = buffer->buffers[cpu];
5142
+
5143
+ reset_disabled_cpu_buffer(cpu_buffer);
5144
+
5145
+ atomic_dec(&cpu_buffer->record_disabled);
5146
+ atomic_dec(&cpu_buffer->resize_disabled);
5147
+ }
5148
+
5149
+ mutex_unlock(&buffer->mutex);
44565150 }
44575151 EXPORT_SYMBOL_GPL(ring_buffer_reset);
44585152
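The RESET_BIT handshake in ring_buffer_reset_online_cpus() above marks only the CPUs that were online before the synchronize_rcu(); a CPU that comes online afterwards never has the bit set and is skipped. The check as a hypothetical predicate:

/* Hypothetical helper: was this CPU buffer marked before the RCU wait? */
static bool rb_cpu_marked_for_reset(struct ring_buffer_per_cpu *cpu_buffer)
{
	return atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT;
}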
....@@ -4460,7 +5154,7 @@
44605154 * ring_buffer_empty - is the ring buffer empty?
44615155 * @buffer: The ring buffer to test
44625156 */
4463
-bool ring_buffer_empty(struct ring_buffer *buffer)
5157
+bool ring_buffer_empty(struct trace_buffer *buffer)
44645158 {
44655159 struct ring_buffer_per_cpu *cpu_buffer;
44665160 unsigned long flags;
....@@ -4490,7 +5184,7 @@
44905184 * @buffer: The ring buffer
44915185 * @cpu: The CPU buffer to test
44925186 */
4493
-bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
5187
+bool ring_buffer_empty_cpu(struct trace_buffer *buffer, int cpu)
44945188 {
44955189 struct ring_buffer_per_cpu *cpu_buffer;
44965190 unsigned long flags;
....@@ -4516,14 +5210,15 @@
45165210 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
45175211 * @buffer_a: One buffer to swap with
45185212 * @buffer_b: The other buffer to swap with
5213
+ * @cpu: the CPU of the buffers to swap
45195214 *
45205215 * This function is useful for tracers that want to take a "snapshot"
45215216 * of a CPU buffer and has another back up buffer lying around.
45225217 * it is expected that the tracer handles the cpu buffer not being
45235218 * used at the moment.
45245219 */
4525
-int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
4526
- struct ring_buffer *buffer_b, int cpu)
5220
+int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
5221
+ struct trace_buffer *buffer_b, int cpu)
45275222 {
45285223 struct ring_buffer_per_cpu *cpu_buffer_a;
45295224 struct ring_buffer_per_cpu *cpu_buffer_b;
....@@ -4555,7 +5250,7 @@
45555250 goto out;
45565251
45575252 /*
4558
- * We can't do a synchronize_sched here because this
5253
+ * We can't do a synchronize_rcu here because this
45595254 * function can be called in atomic context.
45605255 * Normally this will be called from the same CPU as cpu.
45615256 * If not it's up to the caller to protect this.
....@@ -4567,6 +5262,15 @@
45675262 if (local_read(&cpu_buffer_a->committing))
45685263 goto out_dec;
45695264 if (local_read(&cpu_buffer_b->committing))
5265
+ goto out_dec;
5266
+
5267
+ /*
5268
+ * When resize is in progress, we cannot swap it because
5269
+ * it will mess the state of the cpu buffer.
5270
+ */
5271
+ if (atomic_read(&buffer_a->resizing))
5272
+ goto out_dec;
5273
+ if (atomic_read(&buffer_b->resizing))
45705274 goto out_dec;
45715275
45725276 buffer_a->buffers[cpu] = cpu_buffer_b;
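A hypothetical snapshot helper built on the swap, treating any non-zero return as "try again later"; the new resizing check above is one of the reasons the swap can be refused:

/* Hypothetical: swap one CPU of the live buffer with a spare buffer. */
static int example_snapshot_cpu(struct trace_buffer *live,
				struct trace_buffer *spare, int cpu)
{
	int ret = ring_buffer_swap_cpu(live, spare, cpu);

	if (ret)	/* writer mid-commit, mismatched buffers, or a resize in progress */
		return ret;

	/* 'spare' now holds what 'live' contained for this CPU. */
	return 0;
}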
....@@ -4602,7 +5306,7 @@
46025306 * Returns:
46035307 * The page allocated, or ERR_PTR
46045308 */
4605
-void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
5309
+void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
46065310 {
46075311 struct ring_buffer_per_cpu *cpu_buffer;
46085312 struct buffer_data_page *bpage = NULL;
....@@ -4649,12 +5353,17 @@
46495353 *
46505354 * Free a page allocated from ring_buffer_alloc_read_page.
46515355 */
4652
-void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
5356
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
46535357 {
4654
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
5358
+ struct ring_buffer_per_cpu *cpu_buffer;
46555359 struct buffer_data_page *bpage = data;
46565360 struct page *page = virt_to_page(bpage);
46575361 unsigned long flags;
5362
+
5363
+ if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
5364
+ return;
5365
+
5366
+ cpu_buffer = buffer->buffers[cpu];
46585367
46595368 /* If the page is still in use someplace else, we can't reuse it */
46605369 if (page_ref_count(page) > 1)
....@@ -4709,7 +5418,7 @@
47095418 * >=0 if data has been transferred, returns the offset of consumed data.
47105419 * <0 if no data has been transferred.
47115420 */
4712
-int ring_buffer_read_page(struct ring_buffer *buffer,
5421
+int ring_buffer_read_page(struct trace_buffer *buffer,
47135422 void **data_page, size_t len, int cpu, int full)
47145423 {
47155424 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
....@@ -4770,7 +5479,15 @@
47705479 unsigned int pos = 0;
47715480 unsigned int size;
47725481
4773
- if (full)
5482
+ /*
5483
+ * If a full page is expected, this can still be returned
5484
+ * if there's been a previous partial read and the
5485
+ * rest of the page can be read and the commit page is off
5486
+ * the reader page.
5487
+ */
5488
+ if (full &&
5489
+ (!read || (len < (commit - read)) ||
5490
+ cpu_buffer->reader_page == cpu_buffer->commit_page))
47745491 goto out_unlock;
47755492
47765493 if (len > (commit - read))
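A hypothetical consumer of the page-based read path, exercising the partial-page case the comment above allows (assumes <linux/ring_buffer.h> and the usual kernel headers):

/* Hypothetical: pull whatever is available from one CPU into a spare page. */
static void example_read_one_page(struct trace_buffer *buffer, int cpu)
{
	void *page = ring_buffer_alloc_read_page(buffer, cpu);
	int ret;

	if (IS_ERR(page))
		return;

	/* full == 0: accept a partially filled page as well. */
	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 0);
	if (ret >= 0) {
		/* 'page' now holds a buffer_data_page with the copied events. */
	}

	ring_buffer_free_read_page(buffer, cpu, page);
}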
....@@ -4819,7 +5536,7 @@
48195536 } else {
48205537 /* update the entry counter */
48215538 cpu_buffer->read += rb_page_entries(reader);
4822
- cpu_buffer->read_bytes += BUF_PAGE_SIZE;
5539
+ cpu_buffer->read_bytes += rb_page_commit(reader);
48235540
48245541 /* swap the pages */
48255542 rb_init_page(bpage);
....@@ -4880,12 +5597,12 @@
48805597 */
48815598 int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node)
48825599 {
4883
- struct ring_buffer *buffer;
5600
+ struct trace_buffer *buffer;
48845601 long nr_pages_same;
48855602 int cpu_i;
48865603 unsigned long nr_pages;
48875604
4888
- buffer = container_of(node, struct ring_buffer, node);
5605
+ buffer = container_of(node, struct trace_buffer, node);
48895606 if (cpumask_test_cpu(cpu, buffer->cpumask))
48905607 return 0;
48915608
....@@ -4935,7 +5652,7 @@
49355652 static struct task_struct *rb_threads[NR_CPUS] __initdata;
49365653
49375654 struct rb_test_data {
4938
- struct ring_buffer *buffer;
5655
+ struct trace_buffer *buffer;
49395656 unsigned long events;
49405657 unsigned long bytes_written;
49415658 unsigned long bytes_alloc;
....@@ -4983,7 +5700,7 @@
49835700 cnt = data->cnt + (nested ? 27 : 0);
49845701
49855702 /* Multiply cnt by ~e, to make some unique increment */
4986
- size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
5703
+ size = (cnt * 68 / 25) % (sizeof(rb_string) - 1);
49875704
49885705 len = size + sizeof(struct rb_item);
49895706
....@@ -5077,10 +5794,15 @@
50775794 static __init int test_ringbuffer(void)
50785795 {
50795796 struct task_struct *rb_hammer;
5080
- struct ring_buffer *buffer;
5797
+ struct trace_buffer *buffer;
50815798 int cpu;
50825799 int ret = 0;
50835800
5801
+ if (security_locked_down(LOCKDOWN_TRACEFS)) {
5802
+ pr_warn("Lockdown is enabled, skipping ring buffer tests\n");
5803
+ return 0;
5804
+ }
5805
+
50845806 pr_info("Running ring buffer tests...\n");
50855807
50865808 buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);