hc
2023-12-11 072de836f53be56a70cecf70b43ae43b7ce17376
kernel/mm/page_owner.c
@@ -3,13 +3,14 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
 #include <linux/stacktrace.h>
 #include <linux/page_owner.h>
 #include <linux/jump_label.h>
 #include <linux/migrate.h>
 #include <linux/stackdepot.h>
 #include <linux/seq_file.h>
+#include <linux/sched/clock.h>
 
 #include "internal.h"
 
@@ -24,9 +25,13 @@
 	short last_migrate_reason;
 	gfp_t gfp_mask;
 	depot_stack_handle_t handle;
+	depot_stack_handle_t free_handle;
+	u64 ts_nsec;
+	u64 free_ts_nsec;
+	pid_t pid;
 };
 
-static bool page_owner_disabled = true;
+bool page_owner_enabled;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
 static depot_stack_handle_t dummy_handle;
@@ -41,7 +46,7 @@
 		return -EINVAL;
 
 	if (strcmp(buf, "on") == 0)
-		page_owner_disabled = false;
+		page_owner_enabled = true;
 
 	return 0;
 }
@@ -49,24 +54,16 @@
 
 static bool need_page_owner(void)
 {
-	if (page_owner_disabled)
-		return false;
-
-	return true;
+	return page_owner_enabled;
 }
 
 static __always_inline depot_stack_handle_t create_dummy_stack(void)
 {
 	unsigned long entries[4];
-	struct stack_trace dummy;
+	unsigned int nr_entries;
 
-	dummy.nr_entries = 0;
-	dummy.max_entries = ARRAY_SIZE(entries);
-	dummy.entries = &entries[0];
-	dummy.skip = 0;
-
-	save_stack_trace(&dummy);
-	return depot_save_stack(&dummy, GFP_KERNEL);
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+	return stack_depot_save(entries, nr_entries, GFP_KERNEL);
 }
 
 static noinline void register_dummy_stack(void)
@@ -86,7 +83,7 @@
 
 static void init_page_owner(void)
 {
-	if (page_owner_disabled)
+	if (!page_owner_enabled)
 		return;
 
 	register_dummy_stack();
@@ -102,103 +99,133 @@
 	.init = init_page_owner,
 };
 
-static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
+struct page_owner *get_page_owner(struct page_ext *page_ext)
 {
 	return (void *)page_ext + page_owner_ops.offset;
 }
+EXPORT_SYMBOL_GPL(get_page_owner);
 
-void __reset_page_owner(struct page *page, unsigned int order)
+depot_stack_handle_t get_page_owner_handle(struct page_ext *page_ext, unsigned long pfn)
 {
-	int i;
-	struct page_ext *page_ext;
+	struct page_owner *page_owner;
+	depot_stack_handle_t handle;
 
-	for (i = 0; i < (1 << order); i++) {
-		page_ext = lookup_page_ext(page + i);
-		if (unlikely(!page_ext))
-			continue;
-		__clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
-	}
+	if (!page_owner_enabled)
+		return 0;
+
+	page_owner = get_page_owner(page_ext);
+
+	/* skip handle for tail pages of higher order allocations */
+	if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+		return 0;
+
+	handle = READ_ONCE(page_owner->handle);
+	return handle;
 }
+EXPORT_SYMBOL_GPL(get_page_owner_handle);
 
-static inline bool check_recursive_alloc(struct stack_trace *trace,
-					unsigned long ip)
+static inline bool check_recursive_alloc(unsigned long *entries,
+					unsigned int nr_entries,
+					unsigned long ip)
 {
-	int i;
+	unsigned int i;
 
-	if (!trace->nr_entries)
-		return false;
-
-	for (i = 0; i < trace->nr_entries; i++) {
-		if (trace->entries[i] == ip)
+	for (i = 0; i < nr_entries; i++) {
+		if (entries[i] == ip)
 			return true;
 	}
-
 	return false;
 }
 
 static noinline depot_stack_handle_t save_stack(gfp_t flags)
 {
 	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
-	struct stack_trace trace = {
-		.nr_entries = 0,
-		.entries = entries,
-		.max_entries = PAGE_OWNER_STACK_DEPTH,
-		.skip = 2
-	};
 	depot_stack_handle_t handle;
+	unsigned int nr_entries;
 
-	save_stack_trace(&trace);
-	if (trace.nr_entries != 0 &&
-	    trace.entries[trace.nr_entries-1] == ULONG_MAX)
-		trace.nr_entries--;
+	nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
 
 	/*
-	 * We need to check recursion here because our request to stackdepot
-	 * could trigger memory allocation to save new entry. New memory
-	 * allocation would reach here and call depot_save_stack() again
-	 * if we don't catch it. There is still not enough memory in stackdepot
-	 * so it would try to allocate memory again and loop forever.
+	 * We need to check recursion here because our request to
+	 * stackdepot could trigger memory allocation to save new
	 * entry. New memory allocation would reach here and call
+	 * stack_depot_save_entries() again if we don't catch it. There is
+	 * still not enough memory in stackdepot so it would try to
+	 * allocate memory again and loop forever.
 	 */
-	if (check_recursive_alloc(&trace, _RET_IP_))
+	if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
 		return dummy_handle;
 
-	handle = depot_save_stack(&trace, flags);
+	handle = stack_depot_save(entries, nr_entries, flags);
 	if (!handle)
 		handle = failure_handle;
 
 	return handle;
 }
 
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
-	depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
+{
+	int i;
+	struct page_ext *page_ext;
+	depot_stack_handle_t handle = 0;
+	struct page_owner *page_owner;
+	u64 free_ts_nsec = local_clock();
+
+	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+
+	page_ext = page_ext_get(page);
+	if (unlikely(!page_ext))
+		return;
+	for (i = 0; i < (1 << order); i++) {
+		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+		page_owner = get_page_owner(page_ext);
+		page_owner->free_handle = handle;
+		page_owner->free_ts_nsec = free_ts_nsec;
+		page_ext = page_ext_next(page_ext);
+	}
+	page_ext_put(page_ext);
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+	struct page_ext *page_ext, depot_stack_handle_t handle,
+	unsigned int order, gfp_t gfp_mask)
 {
 	struct page_owner *page_owner;
+	int i;
 
-	page_owner = get_page_owner(page_ext);
-	page_owner->handle = handle;
-	page_owner->order = order;
-	page_owner->gfp_mask = gfp_mask;
-	page_owner->last_migrate_reason = -1;
+	for (i = 0; i < (1 << order); i++) {
+		page_owner = get_page_owner(page_ext);
+		page_owner->handle = handle;
+		page_owner->order = order;
+		page_owner->gfp_mask = gfp_mask;
+		page_owner->last_migrate_reason = -1;
+		page_owner->pid = current->pid;
+		page_owner->ts_nsec = local_clock();
+		__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+		__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
 
-	__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+		page_ext = page_ext_next(page_ext);
+	}
 }
 
 noinline void __set_page_owner(struct page *page, unsigned int order,
 					gfp_t gfp_mask)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext;
 	depot_stack_handle_t handle;
 
+	handle = save_stack(gfp_mask);
+
+	page_ext = page_ext_get(page);
 	if (unlikely(!page_ext))
 		return;
-
-	handle = save_stack(gfp_mask);
-	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
+	__set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
+	page_ext_put(page_ext);
 }
 
 void __set_page_owner_migrate_reason(struct page *page, int reason)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = page_ext_get(page);
 	struct page_owner *page_owner;
 
 	if (unlikely(!page_ext))
@@ -206,31 +233,41 @@
 
 	page_owner = get_page_owner(page_ext);
 	page_owner->last_migrate_reason = reason;
+	page_ext_put(page_ext);
 }
 
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
 {
 	int i;
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = page_ext_get(page);
 	struct page_owner *page_owner;
 
 	if (unlikely(!page_ext))
 		return;
 
-	page_owner = get_page_owner(page_ext);
-	page_owner->order = 0;
-	for (i = 1; i < (1 << order); i++)
-		__copy_page_owner(page, page + i);
+	for (i = 0; i < nr; i++) {
+		page_owner = get_page_owner(page_ext);
+		page_owner->order = 0;
+		page_ext = page_ext_next(page_ext);
+	}
+	page_ext_put(page_ext);
 }
 
 void __copy_page_owner(struct page *oldpage, struct page *newpage)
 {
-	struct page_ext *old_ext = lookup_page_ext(oldpage);
-	struct page_ext *new_ext = lookup_page_ext(newpage);
+	struct page_ext *old_ext;
+	struct page_ext *new_ext;
 	struct page_owner *old_page_owner, *new_page_owner;
 
-	if (unlikely(!old_ext || !new_ext))
+	old_ext = page_ext_get(oldpage);
+	if (unlikely(!old_ext))
 		return;
+
+	new_ext = page_ext_get(newpage);
+	if (unlikely(!new_ext)) {
+		page_ext_put(old_ext);
+		return;
+	}
 
 	old_page_owner = get_page_owner(old_ext);
 	new_page_owner = get_page_owner(new_ext);
@@ -239,6 +276,9 @@
 	new_page_owner->last_migrate_reason =
 		old_page_owner->last_migrate_reason;
 	new_page_owner->handle = old_page_owner->handle;
+	new_page_owner->pid = old_page_owner->pid;
+	new_page_owner->ts_nsec = old_page_owner->ts_nsec;
+	new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
 
 	/*
 	 * We don't clear the bit on the oldpage as it's going to be freed
@@ -250,6 +290,9 @@
 	 * the new page, which will be freed.
 	 */
 	__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+	__set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
+	page_ext_put(new_ext);
+	page_ext_put(old_ext);
 }
 
 void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -297,7 +340,7 @@
 			if (PageBuddy(page)) {
 				unsigned long freepage_order;
 
-				freepage_order = page_order_unsafe(page);
+				freepage_order = buddy_order_unsafe(page);
 				if (freepage_order < MAX_ORDER)
 					pfn += (1UL << freepage_order) - 1;
 				continue;
@@ -306,16 +349,15 @@
 			if (PageReserved(page))
 				continue;
 
-			page_ext = lookup_page_ext(page);
+			page_ext = page_ext_get(page);
 			if (unlikely(!page_ext))
 				continue;
 
-			if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
-				continue;
+			if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+				goto ext_put_continue;
 
 			page_owner = get_page_owner(page_ext);
-			page_mt = gfpflags_to_migratetype(
-					page_owner->gfp_mask);
+			page_mt = gfp_migratetype(page_owner->gfp_mask);
 			if (pageblock_mt != page_mt) {
 				if (is_migrate_cma(pageblock_mt))
 					count[MIGRATE_MOVABLE]++;
@@ -323,9 +365,12 @@
 					count[pageblock_mt]++;
 
 				pfn = block_end_pfn;
+				page_ext_put(page_ext);
 				break;
 			}
 			pfn += (1UL << page_owner->order) - 1;
+ext_put_continue:
+			page_ext_put(page_ext);
 		}
 	}
 
@@ -341,32 +386,28 @@
 		struct page *page, struct page_owner *page_owner,
 		depot_stack_handle_t handle)
 {
-	int ret;
-	int pageblock_mt, page_mt;
+	int ret, pageblock_mt, page_mt;
+	unsigned long *entries;
+	unsigned int nr_entries;
 	char *kbuf;
-	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
-	struct stack_trace trace = {
-		.nr_entries = 0,
-		.entries = entries,
-		.max_entries = PAGE_OWNER_STACK_DEPTH,
-		.skip = 0
-	};
 
+	count = min_t(size_t, count, PAGE_SIZE);
 	kbuf = kmalloc(count, GFP_KERNEL);
 	if (!kbuf)
 		return -ENOMEM;
 
 	ret = snprintf(kbuf, count,
-			"Page allocated via order %u, mask %#x(%pGg)\n",
+			"Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n",
 			page_owner->order, page_owner->gfp_mask,
-			&page_owner->gfp_mask);
+			&page_owner->gfp_mask, page_owner->pid,
+			page_owner->ts_nsec, page_owner->free_ts_nsec);
 
 	if (ret >= count)
 		goto err;
 
 	/* Print information relevant to grouping pages by mobility */
 	pageblock_mt = get_pageblock_migratetype(page);
-	page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
+	page_mt = gfp_migratetype(page_owner->gfp_mask);
 	ret += snprintf(kbuf + ret, count - ret,
 			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
 			pfn,
@@ -378,8 +419,8 @@
 	if (ret >= count)
 		goto err;
 
-	depot_fetch_stack(handle, &trace);
-	ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
+	nr_entries = stack_depot_fetch(handle, &entries);
+	ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
 	if (ret >= count)
 		goto err;
 
@@ -408,16 +449,11 @@
 
 void __dump_page_owner(struct page *page)
 {
-	struct page_ext *page_ext = lookup_page_ext(page);
+	struct page_ext *page_ext = page_ext_get((void *)page);
 	struct page_owner *page_owner;
-	unsigned long entries[PAGE_OWNER_STACK_DEPTH];
-	struct stack_trace trace = {
-		.nr_entries = 0,
-		.entries = entries,
-		.max_entries = PAGE_OWNER_STACK_DEPTH,
-		.skip = 0
-	};
 	depot_stack_handle_t handle;
+	unsigned long *entries;
+	unsigned int nr_entries;
 	gfp_t gfp_mask;
 	int mt;
 
@@ -428,28 +464,44 @@
 
 	page_owner = get_page_owner(page_ext);
 	gfp_mask = page_owner->gfp_mask;
-	mt = gfpflags_to_migratetype(gfp_mask);
+	mt = gfp_migratetype(gfp_mask);
 
 	if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
-		pr_alert("page_owner info is not active (free page?)\n");
+		pr_alert("page_owner info is not present (never set?)\n");
+		page_ext_put(page_ext);
 		return;
 	}
+
+	if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+		pr_alert("page_owner tracks the page as allocated\n");
+	else
+		pr_alert("page_owner tracks the page as freed\n");
+
+	pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n",
+		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
+		 page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
 
 	handle = READ_ONCE(page_owner->handle);
 	if (!handle) {
-		pr_alert("page_owner info is not active (free page?)\n");
-		return;
+		pr_alert("page_owner allocation stack trace missing\n");
+	} else {
+		nr_entries = stack_depot_fetch(handle, &entries);
+		stack_trace_print(entries, nr_entries, 0);
 	}
 
-	depot_fetch_stack(handle, &trace);
-	pr_alert("PFN 0x%lx allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
-		 page_to_pfn(page),
-		 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
-	print_stack_trace(&trace, 0);
+	handle = READ_ONCE(page_owner->free_handle);
+	if (!handle) {
+		pr_alert("page_owner free stack trace missing\n");
+	} else {
+		nr_entries = stack_depot_fetch(handle, &entries);
+		pr_alert("page last free stack trace:\n");
+		stack_trace_print(entries, nr_entries, 0);
+	}
 
 	if (page_owner->last_migrate_reason != -1)
 		pr_alert("page has been migrated, last migrate reason: %s\n",
 			migrate_reason_names[page_owner->last_migrate_reason]);
+	page_ext_put(page_ext);
 }
 
 static ssize_t
@@ -476,6 +528,14 @@
 	/* Find an allocated page */
 	for (; pfn < max_pfn; pfn++) {
 		/*
+		 * This temporary page_owner is required so
+		 * that we can avoid the context switches while holding
+		 * the rcu lock and copying the page owner information to
+		 * user through copy_to_user() or GFP_KERNEL allocations.
+		 */
+		struct page_owner page_owner_tmp;
+
+		/*
 		 * If the new page is in a new MAX_ORDER_NR_PAGES area,
 		 * validate the area as existing, skip it if not
 		 */
@@ -490,14 +550,14 @@
 
 		page = pfn_to_page(pfn);
 		if (PageBuddy(page)) {
-			unsigned long freepage_order = page_order_unsafe(page);
+			unsigned long freepage_order = buddy_order_unsafe(page);
 
 			if (freepage_order < MAX_ORDER)
 				pfn += (1UL << freepage_order) - 1;
 			continue;
 		}
 
-		page_ext = lookup_page_ext(page);
+		page_ext = page_ext_get(page);
 		if (unlikely(!page_ext))
 			continue;
 
@@ -506,9 +566,23 @@
 		 * because we don't hold the zone lock.
 		 */
 		if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
-			continue;
+			goto ext_put_continue;
+
+		/*
+		 * Although we do have the info about past allocation of free
+		 * pages, it's not relevant for current memory usage.
+		 */
+		if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+			goto ext_put_continue;
 
 		page_owner = get_page_owner(page_ext);
+
+		/*
+		 * Don't print "tail" pages of high-order allocations as that
+		 * would inflate the stats.
+		 */
+		if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+			goto ext_put_continue;
 
 		/*
 		 * Access to page_ext->handle isn't synchronous so we should
@@ -516,13 +590,17 @@
 		 */
 		handle = READ_ONCE(page_owner->handle);
 		if (!handle)
-			continue;
+			goto ext_put_continue;
 
 		/* Record the next PFN to read in the file offset */
 		*ppos = (pfn - min_low_pfn) + 1;
 
+		page_owner_tmp = *page_owner;
+		page_ext_put(page_ext);
 		return print_page_owner(buf, count, pfn, page,
-				page_owner, handle);
+				&page_owner_tmp, handle);
+ext_put_continue:
+		page_ext_put(page_ext);
 	}
 
 	return 0;
@@ -570,7 +648,7 @@
 			 * heavy lock contention.
 			 */
 			if (PageBuddy(page)) {
-				unsigned long order = page_order_unsafe(page);
+				unsigned long order = buddy_order_unsafe(page);
 
 				if (order > 0 && order < MAX_ORDER)
 					pfn += (1UL << order) - 1;
@@ -580,17 +658,20 @@
 
 			if (PageReserved(page))
 				continue;
-			page_ext = lookup_page_ext(page);
+			page_ext = page_ext_get(page);
 			if (unlikely(!page_ext))
 				continue;
 
 			/* Maybe overlapping zone */
 			if (test_bit(PAGE_EXT_OWNER, &page_ext->flags))
-				continue;
+				goto ext_put_continue;
 
 			/* Found early allocated page */
-			__set_page_owner_handle(page_ext, early_handle, 0, 0);
+			__set_page_owner_handle(page, page_ext, early_handle,
+						0, 0);
 			count++;
+ext_put_continue:
+			page_ext_put(page_ext);
 		}
 		cond_resched();
 	}
@@ -626,16 +707,14 @@
 
 static int __init pageowner_init(void)
 {
-	struct dentry *dentry;
-
 	if (!static_branch_unlikely(&page_owner_inited)) {
 		pr_info("page_owner is disabled\n");
 		return 0;
 	}
 
-	dentry = debugfs_create_file("page_owner", 0400, NULL,
-			NULL, &proc_page_owner_operations);
+	debugfs_create_file("page_owner", 0400, NULL, NULL,
+			    &proc_page_owner_operations);
 
-	return PTR_ERR_OR_ZERO(dentry);
+	return 0;
 }
 late_initcall(pageowner_init)
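
Note: the recurring change in the hunks above is replacing lookup_page_ext() with a page_ext_get()/page_ext_put() pair around every access to the page_owner data, so the page_ext reference is held only while it is actually read or written. The following is a minimal sketch of that pairing, not part of the patch; example_page_owner_order() is a hypothetical helper invented for illustration, and it assumes get_page_owner() is visible to the caller (this patch removes its static inline and exports it).

/*
 * Illustrative sketch only: every successful page_ext_get() must be
 * balanced by page_ext_put() before returning; the NULL case needs no put.
 */
static unsigned short example_page_owner_order(struct page *page)
{
	struct page_ext *page_ext = page_ext_get(page);
	struct page_owner *page_owner;
	unsigned short order;

	if (unlikely(!page_ext))
		return 0;			/* no page_ext backing this page */

	page_owner = get_page_owner(page_ext);
	order = page_owner->order;		/* read while the reference is held */

	page_ext_put(page_ext);			/* balances page_ext_get() */
	return order;
}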