2024-12-19 9370bb92b2d16684ee45cf24e879c93c509162da
kernel/kernel/trace/ring_buffer.c
@@ -355,10 +355,11 @@
 	local_set(&bpage->commit, 0);
 }
 
-/*
- * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
- * this issue out.
- */
+static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
+{
+	return local_read(&bpage->page->commit);
+}
+
 static void free_buffer_page(struct buffer_page *bpage)
 {
 	free_page((unsigned long)bpage->page);
@@ -526,6 +527,8 @@
 	rb_time_t			write_stamp;
 	rb_time_t			before_stamp;
 	u64				read_stamp;
+	/* pages removed since last reset */
+	unsigned long			pages_removed;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
@@ -539,6 +542,7 @@
 	unsigned			flags;
 	int				cpus;
 	atomic_t			record_disabled;
+	atomic_t			resizing;
 	cpumask_var_t			cpumask;
 
 	struct lock_class_key		*reader_lock_key;
@@ -561,6 +565,7 @@
 	struct buffer_page		*head_page;
 	struct buffer_page		*cache_reader_page;
 	unsigned long			cache_read;
+	unsigned long			cache_pages_removed;
 	u64				read_stamp;
 	u64				page_stamp;
 	struct ring_buffer_event	*event;
@@ -1004,6 +1009,9 @@
 	if (full) {
 		poll_wait(filp, &work->full_waiters, poll_table);
 		work->full_waiters_pending = true;
+		if (!cpu_buffer->shortest_full ||
+		    cpu_buffer->shortest_full > full)
+			cpu_buffer->shortest_full = full;
 	} else {
 		poll_wait(filp, &work->waiters, poll_table);
 		work->waiters_pending = true;
@@ -1450,19 +1458,6 @@
 }
 
 /**
- * rb_check_list - make sure a pointer to a list has the last bits zero
- */
-static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
-			 struct list_head *list)
-{
-	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
-		return 1;
-	if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
-		return 1;
-	return 0;
-}
-
-/**
  * rb_check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
 *
@@ -1471,35 +1466,26 @@
  */
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	struct list_head *head = cpu_buffer->pages;
-	struct buffer_page *bpage, *tmp;
+	struct list_head *head = rb_list_head(cpu_buffer->pages);
+	struct list_head *tmp;
 
-	/* Reset the head page if it exists */
-	if (cpu_buffer->head_page)
-		rb_set_head_page(cpu_buffer);
-
-	rb_head_page_deactivate(cpu_buffer);
-
-	if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
-		return -1;
-	if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+	if (RB_WARN_ON(cpu_buffer,
+			rb_list_head(rb_list_head(head->next)->prev) != head))
 		return -1;
 
-	if (rb_check_list(cpu_buffer, head))
+	if (RB_WARN_ON(cpu_buffer,
+			rb_list_head(rb_list_head(head->prev)->next) != head))
 		return -1;
 
-	list_for_each_entry_safe(bpage, tmp, head, list) {
+	for (tmp = rb_list_head(head->next); tmp != head; tmp = rb_list_head(tmp->next)) {
 		if (RB_WARN_ON(cpu_buffer,
-			       bpage->list.next->prev != &bpage->list))
+				rb_list_head(rb_list_head(tmp->next)->prev) != tmp))
 			return -1;
+
 		if (RB_WARN_ON(cpu_buffer,
-			       bpage->list.prev->next != &bpage->list))
-			return -1;
-		if (rb_check_list(cpu_buffer, &bpage->list))
+				rb_list_head(rb_list_head(tmp->prev)->next) != tmp))
 			return -1;
 	}
-
-	rb_head_page_activate(cpu_buffer);
 
 	return 0;
 }
@@ -1666,6 +1652,8 @@
 	struct list_head *head = cpu_buffer->pages;
 	struct buffer_page *bpage, *tmp;
 
+	irq_work_sync(&cpu_buffer->irq_work.work);
+
 	free_buffer_page(cpu_buffer->reader_page);
 
 	if (head) {
@@ -1772,6 +1760,8 @@
 
 	cpuhp_state_remove_instance(CPUHP_TRACE_RB_PREPARE, &buffer->node);
 
+	irq_work_sync(&buffer->irq_work.work);
+
 	for_each_buffer_cpu(buffer, cpu)
 		rb_free_cpu_buffer(buffer->buffers[cpu]);
 
@@ -1851,6 +1841,8 @@
 		to_remove = rb_list_head(to_remove)->next;
 		head_bit |= (unsigned long)to_remove & RB_PAGE_HEAD;
 	}
+	/* Read iterators need to reset themselves when some pages removed */
+	cpu_buffer->pages_removed += nr_removed;
 
 	next_page = rb_list_head(to_remove)->next;
 
@@ -1871,12 +1863,6 @@
 	if (head_bit)
 		cpu_buffer->head_page = list_entry(next_page,
 						struct buffer_page, list);
-
-	/*
-	 * change read pointer to make sure any read iterators reset
-	 * themselves
-	 */
-	cpu_buffer->read = 0;
 
 	/* pages are removed, resume tracing and then free the pages */
 	atomic_dec(&cpu_buffer->record_disabled);
@@ -1905,7 +1891,7 @@
 			 * Increment overrun to account for the lost events.
 			 */
 			local_add(page_entries, &cpu_buffer->overrun);
-			local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+			local_sub(rb_page_commit(to_remove_page), &cpu_buffer->entries_bytes);
 			local_inc(&cpu_buffer->pages_lost);
 		}
 
@@ -2060,7 +2046,7 @@
 
 	/* prevent another thread from changing buffer sizes */
 	mutex_lock(&buffer->mutex);
-
+	atomic_inc(&buffer->resizing);
 
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
 		/*
@@ -2098,6 +2084,8 @@
 				err = -ENOMEM;
 				goto out_err;
 			}
+
+			cond_resched();
 		}
 
 		get_online_cpus();
@@ -2203,6 +2191,7 @@
 		atomic_dec(&buffer->record_disabled);
 	}
 
+	atomic_dec(&buffer->resizing);
 	mutex_unlock(&buffer->mutex);
 	return 0;
 
@@ -2223,6 +2212,7 @@
 		}
 	}
  out_err_unlock:
+	atomic_dec(&buffer->resizing);
 	mutex_unlock(&buffer->mutex);
 	return err;
 }
@@ -2251,11 +2241,6 @@
 			       cpu_buffer->reader_page->read);
 }
 
-static __always_inline unsigned rb_page_commit(struct buffer_page *bpage)
-{
-	return local_read(&bpage->page->commit);
-}
-
 static struct ring_buffer_event *
 rb_iter_head_event(struct ring_buffer_iter *iter)
 {
@@ -2274,6 +2259,11 @@
 	 */
 	commit = rb_page_commit(iter_head_page);
 	smp_rmb();
+
+	/* An event needs to be at least 8 bytes in size */
+	if (iter->head > commit - 8)
+		goto reset;
+
 	event = __rb_page_index(iter_head_page, iter->head);
 	length = rb_event_length(event);
 
@@ -2396,7 +2386,7 @@
 		 * the counters.
 		 */
 		local_add(entries, &cpu_buffer->overrun);
-		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
+		local_sub(rb_page_commit(next_page), &cpu_buffer->entries_bytes);
 		local_inc(&cpu_buffer->pages_lost);
 
 		/*
@@ -2539,9 +2529,6 @@
 
 	event = __rb_page_index(tail_page, tail);
 
-	/* account for padding bytes */
-	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
-
 	/*
 	 * Save the original length to the meta data.
 	 * This will be used by the reader to add lost event
@@ -2555,7 +2542,8 @@
 	 * write counter enough to allow another writer to slip
 	 * in on this page.
 	 * We put in a discarded commit instead, to make sure
-	 * that this space is not used again.
+	 * that this space is not used again, and this space will
+	 * not be accounted into 'entries_bytes'.
 	 *
 	 * If we are less than the minimum size, we don't need to
 	 * worry about it.
@@ -2579,6 +2567,9 @@
 	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	event->time_delta = 1;
+
+	/* account for padding bytes */
+	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
 
 	/* Make sure the padding is visible before the tail_page->write update */
 	smp_wmb();
@@ -2984,6 +2975,10 @@
 		if (RB_WARN_ON(cpu_buffer,
 			       rb_is_reader_page(cpu_buffer->tail_page)))
 			return;
+		/*
+		 * No need for a memory barrier here, as the update
+		 * of the tail_page did it for this page.
+		 */
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -2993,6 +2988,8 @@
 	while (rb_commit_index(cpu_buffer) !=
 	       rb_page_write(cpu_buffer->commit_page)) {
 
+		/* Make sure the readers see the content of what is committed. */
+		smp_wmb();
 		local_set(&cpu_buffer->commit_page->page->commit,
 			  rb_page_write(cpu_buffer->commit_page));
 		RB_WARN_ON(cpu_buffer,
@@ -3939,7 +3936,7 @@
 EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
 
 /**
- * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * ring_buffer_bytes_cpu - get the number of bytes unconsumed in a cpu buffer
 * @buffer: The ring buffer
 * @cpu: The per CPU buffer to read from.
 */
@@ -4117,6 +4114,7 @@
 
 	iter->cache_reader_page = iter->head_page;
 	iter->cache_read = cpu_buffer->read;
+	iter->cache_pages_removed = cpu_buffer->pages_removed;
 
 	if (iter->head) {
 		iter->read_stamp = cpu_buffer->read_stamp;
@@ -4412,7 +4410,12 @@
 
 	/*
 	 * Make sure we see any padding after the write update
-	 * (see rb_reset_tail())
+	 * (see rb_reset_tail()).
+	 *
+	 * In addition, a writer may be writing on the reader page
+	 * if the page has not been fully filled, so the read barrier
+	 * is also needed to make sure we see the content of what is
+	 * committed by the writer (see rb_set_commit_to_write()).
 	 */
 	smp_rmb();
 
@@ -4441,6 +4444,7 @@
 
 	length = rb_event_length(event);
 	cpu_buffer->reader_page->read += length;
+	cpu_buffer->read_bytes += length;
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -4565,12 +4569,13 @@
 	buffer = cpu_buffer->buffer;
 
 	/*
-	 * Check if someone performed a consuming read to
-	 * the buffer. A consuming read invalidates the iterator
-	 * and we need to reset the iterator in this case.
+	 * Check if someone performed a consuming read to the buffer
+	 * or removed some pages from the buffer. In these cases,
+	 * iterator was invalidated and we need to reset it.
 	 */
 	if (unlikely(iter->cache_read != cpu_buffer->read ||
-		     iter->cache_reader_page != cpu_buffer->reader_page))
+		     iter->cache_reader_page != cpu_buffer->reader_page ||
+		     iter->cache_pages_removed != cpu_buffer->pages_removed))
 		rb_iter_reset(iter);
 
  again:
@@ -4961,28 +4966,34 @@
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+static void rb_clear_buffer_page(struct buffer_page *page)
+{
+	local_set(&page->write, 0);
+	local_set(&page->entries, 0);
+	rb_init_page(page->page);
+	page->read = 0;
+}
+
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
+	struct buffer_page *page;
+
 	rb_head_page_deactivate(cpu_buffer);
 
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages, struct buffer_page, list);
-	local_set(&cpu_buffer->head_page->write, 0);
-	local_set(&cpu_buffer->head_page->entries, 0);
-	local_set(&cpu_buffer->head_page->page->commit, 0);
-
-	cpu_buffer->head_page->read = 0;
+	rb_clear_buffer_page(cpu_buffer->head_page);
+	list_for_each_entry(page, cpu_buffer->pages, list) {
+		rb_clear_buffer_page(page);
+	}
 
 	cpu_buffer->tail_page = cpu_buffer->head_page;
 	cpu_buffer->commit_page = cpu_buffer->head_page;
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	INIT_LIST_HEAD(&cpu_buffer->new_pages);
-	local_set(&cpu_buffer->reader_page->write, 0);
-	local_set(&cpu_buffer->reader_page->entries, 0);
-	local_set(&cpu_buffer->reader_page->page->commit, 0);
-	cpu_buffer->reader_page->read = 0;
+	rb_clear_buffer_page(cpu_buffer->reader_page);
 
 	local_set(&cpu_buffer->entries_bytes, 0);
 	local_set(&cpu_buffer->overrun, 0);
@@ -5006,6 +5017,7 @@
 	cpu_buffer->last_overrun = 0;
 
 	rb_head_page_activate(cpu_buffer);
+	cpu_buffer->pages_removed = 0;
 }
 /* Must have disabled the cpu buffer then done a synchronize_rcu */
 
@@ -5058,6 +5070,9 @@
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
+/* Flag to ensure proper resetting of atomic variables */
+#define RESET_BIT	(1 << 30)
+
 /**
 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
 * @buffer: The ring buffer to reset a per cpu buffer of
@@ -5074,20 +5089,27 @@
 	for_each_online_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 
-		atomic_inc(&cpu_buffer->resize_disabled);
+		atomic_add(RESET_BIT, &cpu_buffer->resize_disabled);
 		atomic_inc(&cpu_buffer->record_disabled);
 	}
 
 	/* Make sure all commits have finished */
 	synchronize_rcu();
 
-	for_each_online_buffer_cpu(buffer, cpu) {
+	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
+
+		/*
+		 * If a CPU came online during the synchronize_rcu(), then
+		 * ignore it.
+		 */
+		if (!(atomic_read(&cpu_buffer->resize_disabled) & RESET_BIT))
+			continue;
 
 		reset_disabled_cpu_buffer(cpu_buffer);
 
 		atomic_dec(&cpu_buffer->record_disabled);
-		atomic_dec(&cpu_buffer->resize_disabled);
+		atomic_sub(RESET_BIT, &cpu_buffer->resize_disabled);
 	}
 
 	mutex_unlock(&buffer->mutex);
@@ -5242,6 +5264,15 @@
 	if (local_read(&cpu_buffer_b->committing))
 		goto out_dec;
 
+	/*
+	 * When resize is in progress, we cannot swap it because
+	 * it will mess the state of the cpu buffer.
+	 */
+	if (atomic_read(&buffer_a->resizing))
+		goto out_dec;
+	if (atomic_read(&buffer_b->resizing))
+		goto out_dec;
+
 	buffer_a->buffers[cpu] = cpu_buffer_b;
 	buffer_b->buffers[cpu] = cpu_buffer_a;
 
@@ -5324,10 +5355,15 @@
 */
 void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_data_page *bpage = data;
 	struct page *page = virt_to_page(bpage);
 	unsigned long flags;
+
+	if (!buffer || !buffer->buffers || !buffer->buffers[cpu])
+		return;
+
+	cpu_buffer = buffer->buffers[cpu];
 
 	/* If the page is still in use someplace else, we can't reuse it */
 	if (page_ref_count(page) > 1)
@@ -5500,7 +5536,7 @@
 	} else {
 		/* update the entry counter */
 		cpu_buffer->read += rb_page_entries(reader);
-		cpu_buffer->read_bytes += BUF_PAGE_SIZE;
+		cpu_buffer->read_bytes += rb_page_commit(reader);
 
 		/* swap the pages */
 		rb_init_page(bpage);