2024-05-14 bedbef8ad3e75a304af6361af235302bcc61d06b
kernel/net/core/page_pool.c
@@ -4,15 +4,22 @@
  * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
  * Copyright (C) 2016 Red Hat, Inc.
  */
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/device.h>
 
 #include <net/page_pool.h>
 #include <linux/dma-direction.h>
 #include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
+
+#include <trace/events/page_pool.h>
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
 
 static int page_pool_init(struct page_pool *pool,
                           const struct page_pool_params *params)
@@ -36,12 +43,37 @@
         * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
         * which is the XDP_TX use-case.
         */
-       if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
-           (pool->p.dma_dir != DMA_BIDIRECTIONAL))
-               return -EINVAL;
+       if (pool->p.flags & PP_FLAG_DMA_MAP) {
+               if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+                   (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+                       return -EINVAL;
+       }
+
+       if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+               /* In order to request DMA-sync-for-device the page
+                * needs to be mapped
+                */
+               if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+                       return -EINVAL;
+
+               if (!pool->p.max_len)
+                       return -EINVAL;
+
+               /* pool->p.offset has to be set according to the address
+                * offset used by the DMA engine to start copying rx data
+                */
+       }
 
        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;
+
+       atomic_set(&pool->pages_state_release_cnt, 0);
+
+       /* Drivers calling page_pool_create() must also call page_pool_destroy() */
+       refcount_set(&pool->user_cnt, 1);
+
+       if (pool->p.flags & PP_FLAG_DMA_MAP)
+               get_device(pool->p.dev);
 
        return 0;
 }
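
For reference, a minimal driver-side sketch of the page_pool_params fields that the extended page_pool_init() above validates. This is not part of the patch; the function name, ring size and headroom values are hypothetical. The key constraints are: PP_FLAG_DMA_SYNC_DEV requires PP_FLAG_DMA_MAP, a non-zero max_len, and an offset matching where the DMA engine starts writing RX data.

#include <linux/dma-direction.h>
#include <linux/bpf.h>          /* XDP_PACKET_HEADROOM */
#include <net/page_pool.h>

static struct page_pool *rxq_create_pool(struct device *dma_dev, int numa_node)
{
        struct page_pool_params pp_params = {
                .flags    = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                .order    = 0,
                .pool_size = 256,                       /* hypothetical RX ring size */
                .nid      = numa_node,
                .dev      = dma_dev,                    /* used for DMA mapping */
                .dma_dir  = DMA_FROM_DEVICE,            /* DMA_BIDIRECTIONAL for XDP_TX */
                .offset   = XDP_PACKET_HEADROOM,        /* where HW starts writing */
                .max_len  = PAGE_SIZE - XDP_PACKET_HEADROOM,
        };

        return page_pool_create(&pp_params);    /* ERR_PTR() on failure */
}

page_pool_create() copies the params into the pool, so a stack-local struct like this is sufficient.
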
@@ -49,7 +81,7 @@
 struct page_pool *page_pool_create(const struct page_pool_params *params)
 {
        struct page_pool *pool;
-       int err = 0;
+       int err;
 
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
@@ -61,49 +93,91 @@
                kfree(pool);
                return ERR_PTR(err);
        }
+
        return pool;
 }
 EXPORT_SYMBOL(page_pool_create);
 
-/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static void page_pool_return_page(struct page_pool *pool, struct page *page);
+
+noinline
+static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 {
        struct ptr_ring *r = &pool->ring;
        struct page *page;
+       int pref_nid; /* preferred NUMA node */
 
        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r))
                return NULL;
 
-       /* Test for safe-context, caller should provide this guarantee */
-       if (likely(in_serving_softirq())) {
-               if (likely(pool->alloc.count)) {
-                       /* Fast-path */
-                       page = pool->alloc.cache[--pool->alloc.count];
-                       return page;
-               }
-               /* Slower-path: Alloc array empty, time to refill
-                *
-                * Open-coded bulk ptr_ring consumer.
-                *
-                * Discussion: the ring consumer lock is not really
-                * needed due to the softirq/NAPI protection, but
-                * later need the ability to reclaim pages on the
-                * ring. Thus, keeping the locks.
-                */
-               spin_lock(&r->consumer_lock);
-               while ((page = __ptr_ring_consume(r))) {
-                       if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
-                               break;
+       /* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
+        * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
+        */
+#ifdef CONFIG_NUMA
+       pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+       /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+       pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+       /* Slower-path: Get pages from locked ring queue */
+       spin_lock(&r->consumer_lock);
+
+       /* Refill alloc array, but only if NUMA match */
+       do {
+               page = __ptr_ring_consume(r);
+               if (unlikely(!page))
+                       break;
+
+               if (likely(page_to_nid(page) == pref_nid)) {
                        pool->alloc.cache[pool->alloc.count++] = page;
+               } else {
+                       /* NUMA mismatch;
+                        * (1) release 1 page to page-allocator and
+                        * (2) break out to fallthrough to alloc_pages_node.
+                        * This limits stress on the page buddy allocator.
+                        */
+                       page_pool_return_page(pool, page);
+                       page = NULL;
+                       break;
                }
-               spin_unlock(&r->consumer_lock);
-               return page;
+       } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+
+       /* Return last page */
+       if (likely(pool->alloc.count > 0))
+               page = pool->alloc.cache[--pool->alloc.count];
+
+       spin_unlock(&r->consumer_lock);
+       return page;
+}
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+       struct page *page;
+
+       /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+       if (likely(pool->alloc.count)) {
+               /* Fast-path */
+               page = pool->alloc.cache[--pool->alloc.count];
+       } else {
+               page = page_pool_refill_alloc_cache(pool);
        }
 
-       /* Slow-path: Get page from locked ring queue */
-       page = ptr_ring_consume(&pool->ring);
        return page;
+}
+
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+                                          struct page *page,
+                                          unsigned int dma_sync_size)
+{
+       dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
+       dma_sync_size = min(dma_sync_size, pool->p.max_len);
+       dma_sync_single_range_for_device(pool->p.dev, dma_addr,
+                                        pool->p.offset, dma_sync_size,
+                                        pool->p.dma_dir);
 }
 
 /* slow path */
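
A worked example of the sync window that page_pool_dma_sync_for_device() hands to dma_sync_single_range_for_device(): the range always starts at pool->p.offset into the mapped page and is capped at pool->p.max_len. The numbers below are hypothetical and the helper is illustration only, not part of the patch.

#include <linux/kernel.h>       /* min() */
#include <linux/printk.h>

static void sync_window_example(void)
{
        unsigned int offset = 256;              /* pool->p.offset, e.g. RX headroom */
        unsigned int max_len = 4096 - 256;      /* pool->p.max_len for an order-0 page */
        unsigned int dma_sync_size = 1514;      /* caller-supplied, e.g. frame length */
        unsigned int len = min(dma_sync_size, max_len);

        /* device regains ownership of [dma_addr + offset, dma_addr + offset + len) */
        pr_info("sync-for-device %u bytes at page offset %u\n", len, offset);
}
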
@@ -129,26 +203,40 @@
         */
 
        /* Cache was empty, do real allocation */
+#ifdef CONFIG_NUMA
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+#else
+       page = alloc_pages(gfp, pool->p.order);
+#endif
        if (!page)
                return NULL;
 
        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_map;
 
-       /* Setup DMA mapping: use page->private for DMA-addr
+       /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
+        * since dma_addr_t can be either 32 or 64 bits and does not always fit
+        * into page private data (i.e. 32-bit CPU with 64-bit DMA caps)
         * This mapping is kept for lifetime of page, until leaving pool.
         */
-       dma = dma_map_page(pool->p.dev, page, 0,
-                          (PAGE_SIZE << pool->p.order),
-                          pool->p.dma_dir);
+       dma = dma_map_page_attrs(pool->p.dev, page, 0,
+                                (PAGE_SIZE << pool->p.order),
+                                pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
        if (dma_mapping_error(pool->p.dev, dma)) {
                put_page(page);
                return NULL;
        }
-       set_page_private(page, dma); /* page->private = dma; */
+       page_pool_set_dma_addr(page, dma);
+
+       if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+               page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 skip_dma_map:
+       /* Track how many pages are held 'in-flight' */
+       pool->pages_state_hold_cnt++;
+
+       trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
        /* When page just alloc'ed it should/must have refcnt 1. */
        return page;
 }
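
A short driver-side sketch of how the DMA address is consumed now that the pool stores it via page_pool_set_dma_addr() rather than in page->private. The helper name is hypothetical and not part of the patch; it assumes the pool was created with PP_FLAG_DMA_MAP so the mapping already exists.

#include <net/page_pool.h>

static struct page *rxq_get_buffer(struct page_pool *pool, dma_addr_t *dma)
{
        struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);

        if (!page)
                return NULL;

        /* address was stored by the pool via page_pool_set_dma_addr();
         * the driver adds its own headroom offset before handing it to HW
         */
        *dma = page_pool_get_dma_addr(page);
        return page;
}
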
@@ -171,23 +259,62 @@
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
-                                  struct page *page)
-{
-       if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-               return;
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)        (s32)((a) - (b))
 
-       /* DMA unmap */
-       dma_unmap_page(pool->p.dev, page_private(page),
-                      PAGE_SIZE << pool->p.order, pool->p.dma_dir);
-       set_page_private(page, 0);
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+       u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+       u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+       s32 inflight;
+
+       inflight = _distance(hold_cnt, release_cnt);
+
+       trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
+       WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+       return inflight;
 }
 
-/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_release_page(struct page_pool *pool, struct page *page)
 {
-       __page_pool_clean_page(pool, page);
+       dma_addr_t dma;
+       int count;
+
+       if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+               /* Always account for inflight pages, even if we didn't
+                * map them
+                */
+               goto skip_dma_unmap;
+
+       dma = page_pool_get_dma_addr(page);
+
+       /* When page is unmapped, it cannot be returned to our pool */
+       dma_unmap_page_attrs(pool->p.dev, dma,
+                            PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+                            DMA_ATTR_SKIP_CPU_SYNC);
+       page_pool_set_dma_addr(page, 0);
+skip_dma_unmap:
+       /* This may be the last page returned, releasing the pool, so
+        * it is not safe to reference pool afterwards.
+        */
+       count = atomic_inc_return(&pool->pages_state_release_cnt);
+       trace_page_pool_state_release(pool, page, count);
+}
+EXPORT_SYMBOL(page_pool_release_page);
+
+/* Return a page to the page allocator, cleaning up our state */
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+       page_pool_release_page(pool, page);
+
        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
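
The in-flight accounting added above pairs the free-running pages_state_hold_cnt with pages_state_release_cnt and relies on serial-number arithmetic: the signed difference stays correct as long as fewer than 2^31 pages separate the two u32 counters. A small self-contained example with hypothetical values, not part of the patch:

#include <linux/types.h>

#define _distance(a, b)        (s32)((a) - (b))

static s32 inflight_example(void)
{
        u32 hold_cnt = 5;               /* hold counter has wrapped past U32_MAX */
        u32 release_cnt = 4294967291u;  /* == U32_MAX - 4 */

        /* (5 - 4294967291) wraps to 10 as u32; the s32 cast keeps it +10 */
        return _distance(hold_cnt, release_cnt);        /* == 10 pages in flight */
}
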
@@ -195,8 +322,7 @@
         */
 }
 
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
-                                         struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 {
        int ret;
        /* BH protection not needed if current is serving softirq */
@@ -213,7 +339,7 @@
  *
  * Caller must provide appropriate safe context.
  */
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(struct page *page,
                                       struct page_pool *pool)
 {
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
@@ -224,8 +350,22 @@
        return true;
 }
 
-void __page_pool_put_page(struct page_pool *pool,
-                         struct page *page, bool allow_direct)
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+       return !page_is_pfmemalloc(page);
+}
+
+/* If the page refcnt == 1, this will try to recycle the page.
+ * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->p.max_len).
+ * If the page refcnt != 1, then the page will be returned to the memory
+ * subsystem.
+ */
+void page_pool_put_page(struct page_pool *pool, struct page *page,
+                       unsigned int dma_sync_size, bool allow_direct)
 {
        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but have fallbacks that act like the
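
A hedged driver-side sketch of the new page_pool_put_page() signature; the function below is hypothetical and not part of the patch. Callers running in softirq/NAPI context can pass allow_direct=true to recycle into the lockless alloc cache, and dma_sync_size only needs to cover the bytes the device may have written (typically the received frame length).

#include <net/page_pool.h>

static void rx_drop_frame(struct page_pool *pool, struct page *page,
                          unsigned int pkt_len)
{
        /* called from the driver's NAPI poll loop (softirq context) */
        page_pool_put_page(pool, page, pkt_len, true);
}
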
@@ -233,16 +373,21 @@
         *
         * refcnt == 1 means page_pool owns page, and can recycle it.
         */
-       if (likely(page_ref_count(page) == 1)) {
+       if (likely(page_ref_count(page) == 1 &&
+                  pool_page_reusable(pool, page))) {
                /* Read barrier done in page_ref_count / READ_ONCE */
 
+               if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+                       page_pool_dma_sync_for_device(pool, page,
+                                                     dma_sync_size);
+
                if (allow_direct && in_serving_softirq())
-                       if (__page_pool_recycle_direct(page, pool))
+                       if (page_pool_recycle_in_cache(page, pool))
                                return;
 
-               if (!__page_pool_recycle_into_ring(pool, page)) {
+               if (!page_pool_recycle_in_ring(pool, page)) {
                        /* Cache full, fallback to free pages */
-                       __page_pool_return_page(pool, page);
+                       page_pool_return_page(pool, page);
                }
                return;
        }
259404 * doing refcnt based recycle tricks, meaning another process
260405 * will be invoking put_page.
261406 */
262
- __page_pool_clean_page(pool, page);
407
+ /* Do not replace this with page_pool_return_page() */
408
+ page_pool_release_page(pool, page);
263409 put_page(page);
264410 }
265
-EXPORT_SYMBOL(__page_pool_put_page);
411
+EXPORT_SYMBOL(page_pool_put_page);
266412
267
-static void __page_pool_empty_ring(struct page_pool *pool)
413
+static void page_pool_empty_ring(struct page_pool *pool)
268414 {
269415 struct page *page;
270416
@@ -275,27 +421,29 @@
                pr_crit("%s() page_pool refcnt %d violation\n",
                        __func__, page_ref_count(page));
 
-               __page_pool_return_page(pool, page);
+               page_pool_return_page(pool, page);
        }
 }
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void page_pool_free(struct page_pool *pool)
 {
-       struct page_pool *pool;
+       if (pool->disconnect)
+               pool->disconnect(pool);
 
-       pool = container_of(rcu, struct page_pool, rcu);
-
-       WARN(pool->alloc.count, "API usage violation");
-
-       __page_pool_empty_ring(pool);
        ptr_ring_cleanup(&pool->ring, NULL);
+
+       if (pool->p.flags & PP_FLAG_DMA_MAP)
+               put_device(pool->p.dev);
+
        kfree(pool);
 }
 
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 {
        struct page *page;
+
+       if (pool->destroy_cnt)
+               return;
 
        /* Empty alloc cache, assume caller made sure this is
         * no-longer in use, and page_pool_alloc_pages() cannot be
@@ -303,15 +451,93 @@
         */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
-               __page_pool_return_page(pool, page);
+               page_pool_return_page(pool, page);
        }
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+       page_pool_empty_alloc_cache_once(pool);
+       pool->destroy_cnt++;
 
        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
-       __page_pool_empty_ring(pool);
+       page_pool_empty_ring(pool);
+}
 
-       /* An xdp_mem_allocator can still ref page_pool pointer */
-       call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+static int page_pool_release(struct page_pool *pool)
+{
+       int inflight;
+
+       page_pool_scrub(pool);
+       inflight = page_pool_inflight(pool);
+       if (!inflight)
+               page_pool_free(pool);
+
+       return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+       struct delayed_work *dwq = to_delayed_work(wq);
+       struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+       int inflight;
+
+       inflight = page_pool_release(pool);
+       if (!inflight)
+               return;
+
+       /* Periodic warning */
+       if (time_after_eq(jiffies, pool->defer_warn)) {
+               int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+               pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+                       __func__, inflight, sec);
+               pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+       }
+
+       /* Still not ready to be disconnected, retry later */
+       schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+       refcount_inc(&pool->user_cnt);
+       pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+       if (!pool)
+               return;
+
+       if (!page_pool_put(pool))
+               return;
+
+       if (!page_pool_release(pool))
+               return;
+
+       pool->defer_start = jiffies;
+       pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+       INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+       schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 }
 EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+       struct page *page;
+
+       trace_page_pool_update_nid(pool, new_nid);
+       pool->p.nid = new_nid;
+
+       /* Flush pool alloc cache, as refill will check NUMA node */
+       while (pool->alloc.count) {
+               page = pool->alloc.cache[--pool->alloc.count];
+               page_pool_return_page(pool, page);
+       }
+}
+EXPORT_SYMBOL(page_pool_update_nid);
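
Finally, a driver-side sketch of the shutdown and NUMA-update entry points added above. 'struct my_rxq' and both helpers are hypothetical, not part of the patch. page_pool_destroy() tolerates a NULL pool and, when pages are still in flight, defers the final free to the delayed work that retries every DEFER_TIME and warns at most once per DEFER_WARN_INTERVAL.

#include <linux/topology.h>     /* cpu_to_node() */
#include <net/page_pool.h>

struct my_rxq {
        struct page_pool *page_pool;
        /* ... other driver state ... */
};

static void my_rxq_teardown(struct my_rxq *rxq)
{
        /* RX ring already drained; any remaining pages stay "in flight"
         * and are handled by the deferred release work
         */
        page_pool_destroy(rxq->page_pool);
        rxq->page_pool = NULL;
}

static void my_rxq_affinity_changed(struct my_rxq *rxq, int new_cpu)
{
        /* runs in NAPI context; subsequent refills prefer the new node */
        page_pool_update_nid(rxq->page_pool, cpu_to_node(new_cpu));
}
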