2024-05-14 bedbef8ad3e75a304af6361af235302bcc61d06b
kernel/net/core/page_pool.c
@@ -4,15 +4,22 @@
  * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
  * Copyright (C) 2016 Red Hat, Inc.
  */
+
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/device.h>
 
 #include <net/page_pool.h>
 #include <linux/dma-direction.h>
 #include <linux/dma-mapping.h>
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
+
+#include <trace/events/page_pool.h>
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
 
 static int page_pool_init(struct page_pool *pool,
                           const struct page_pool_params *params)
@@ -36,12 +43,37 @@
         * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
         * which is the XDP_TX use-case.
         */
-       if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
-           (pool->p.dma_dir != DMA_BIDIRECTIONAL))
-               return -EINVAL;
+       if (pool->p.flags & PP_FLAG_DMA_MAP) {
+               if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+                   (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+                       return -EINVAL;
+       }
+
+       if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+               /* In order to request DMA-sync-for-device the page
+                * needs to be mapped
+                */
+               if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+                       return -EINVAL;
+
+               if (!pool->p.max_len)
+                       return -EINVAL;
+
+               /* pool->p.offset has to be set according to the address
+                * offset used by the DMA engine to start copying rx data
+                */
+       }
 
        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;
+
+       atomic_set(&pool->pages_state_release_cnt, 0);
+
+       /* Drivers calling page_pool_create() must also call page_pool_destroy() */
+       refcount_set(&pool->user_cnt, 1);
+
+       if (pool->p.flags & PP_FLAG_DMA_MAP)
+               get_device(pool->p.dev);
 
        return 0;
 }
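
For reference, a minimal driver-side sketch of the page_pool_params fields that the extended page_pool_init() above validates. This is not part of the patch; the function name, ring size and headroom values are hypothetical. The key constraints are: PP_FLAG_DMA_SYNC_DEV requires PP_FLAG_DMA_MAP, a non-zero max_len, and an offset matching where the DMA engine starts writing RX data.

#include <linux/dma-direction.h>
#include <linux/bpf.h>          /* XDP_PACKET_HEADROOM */
#include <net/page_pool.h>

static struct page_pool *rxq_create_pool(struct device *dma_dev, int numa_node)
{
        struct page_pool_params pp_params = {
                .flags    = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
                .order    = 0,
                .pool_size = 256,                       /* hypothetical RX ring size */
                .nid      = numa_node,
                .dev      = dma_dev,                    /* used for DMA mapping */
                .dma_dir  = DMA_FROM_DEVICE,            /* DMA_BIDIRECTIONAL for XDP_TX */
                .offset   = XDP_PACKET_HEADROOM,        /* where HW starts writing */
                .max_len  = PAGE_SIZE - XDP_PACKET_HEADROOM,
        };

        return page_pool_create(&pp_params);    /* ERR_PTR() on failure */
}

page_pool_create() copies the params into the pool, so a stack-local struct like this is sufficient.
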
@@ -49,7 +81,7 @@
 struct page_pool *page_pool_create(const struct page_pool_params *params)
 {
        struct page_pool *pool;
-       int err = 0;
+       int err;
 
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
@@ -61,49 +93,91 @@
                kfree(pool);
                return ERR_PTR(err);
        }
+
        return pool;
 }
 EXPORT_SYMBOL(page_pool_create);
 
-/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static void page_pool_return_page(struct page_pool *pool, struct page *page);
+
+noinline
+static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
 {
        struct ptr_ring *r = &pool->ring;
        struct page *page;
+       int pref_nid; /* preferred NUMA node */
 
        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r))
                return NULL;
 
-       /* Test for safe-context, caller should provide this guarantee */
-       if (likely(in_serving_softirq())) {
-               if (likely(pool->alloc.count)) {
-                       /* Fast-path */
-                       page = pool->alloc.cache[--pool->alloc.count];
-                       return page;
-               }
-               /* Slower-path: Alloc array empty, time to refill
-                *
-                * Open-coded bulk ptr_ring consumer.
-                *
-                * Discussion: the ring consumer lock is not really
-                * needed due to the softirq/NAPI protection, but
-                * later need the ability to reclaim pages on the
-                * ring. Thus, keeping the locks.
-                */
-               spin_lock(&r->consumer_lock);
-               while ((page = __ptr_ring_consume(r))) {
-                       if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
-                               break;
+       /* Softirq guarantees the CPU, and thus the NUMA node, is stable. This
+        * assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
+        */
+#ifdef CONFIG_NUMA
+       pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+       /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+       pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+       /* Slower-path: Get pages from locked ring queue */
+       spin_lock(&r->consumer_lock);
+
+       /* Refill alloc array, but only if NUMA match */
+       do {
+               page = __ptr_ring_consume(r);
+               if (unlikely(!page))
+                       break;
+
+               if (likely(page_to_nid(page) == pref_nid)) {
                        pool->alloc.cache[pool->alloc.count++] = page;
+               } else {
+                       /* NUMA mismatch;
+                        * (1) release 1 page to page-allocator and
+                        * (2) break out to fallthrough to alloc_pages_node.
+                        * This limits stress on the page buddy allocator.
+                        */
+                       page_pool_return_page(pool, page);
+                       page = NULL;
+                       break;
                }
-               spin_unlock(&r->consumer_lock);
-               return page;
+       } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+
+       /* Return last page */
+       if (likely(pool->alloc.count > 0))
+               page = pool->alloc.cache[--pool->alloc.count];
+
+       spin_unlock(&r->consumer_lock);
+       return page;
+}
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+       struct page *page;
+
+       /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+       if (likely(pool->alloc.count)) {
+               /* Fast-path */
+               page = pool->alloc.cache[--pool->alloc.count];
+       } else {
+               page = page_pool_refill_alloc_cache(pool);
        }
 
-       /* Slow-path: Get page from locked ring queue */
-       page = ptr_ring_consume(&pool->ring);
        return page;
+}
+
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+                                          struct page *page,
+                                          unsigned int dma_sync_size)
+{
+       dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
+       dma_sync_size = min(dma_sync_size, pool->p.max_len);
+       dma_sync_single_range_for_device(pool->p.dev, dma_addr,
+                                        pool->p.offset, dma_sync_size,
+                                        pool->p.dma_dir);
 }
 
 /* slow path */
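
A worked example of the sync window that page_pool_dma_sync_for_device() hands to dma_sync_single_range_for_device(): the range always starts at pool->p.offset into the mapped page and is capped at pool->p.max_len. The numbers below are hypothetical and the helper is illustration only, not part of the patch.

#include <linux/kernel.h>       /* min() */
#include <linux/printk.h>

static void sync_window_example(void)
{
        unsigned int offset = 256;              /* pool->p.offset, e.g. RX headroom */
        unsigned int max_len = 4096 - 256;      /* pool->p.max_len for an order-0 page */
        unsigned int dma_sync_size = 1514;      /* caller-supplied, e.g. frame length */
        unsigned int len = min(dma_sync_size, max_len);

        /* device regains ownership of [dma_addr + offset, dma_addr + offset + len) */
        pr_info("sync-for-device %u bytes at page offset %u\n", len, offset);
}
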
@@ -129,26 +203,40 @@
         */
 
        /* Cache was empty, do real allocation */
+#ifdef CONFIG_NUMA
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+#else
+       page = alloc_pages(gfp, pool->p.order);
+#endif
        if (!page)
                return NULL;
 
        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_map;
 
-       /* Setup DMA mapping: use page->private for DMA-addr
+       /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
+        * since dma_addr_t can be either 32 or 64 bits and does not always fit
+        * into page private data (i.e. 32-bit CPU with 64-bit DMA caps)
         * This mapping is kept for lifetime of page, until leaving pool.
         */
-       dma = dma_map_page(pool->p.dev, page, 0,
-                          (PAGE_SIZE << pool->p.order),
-                          pool->p.dma_dir);
+       dma = dma_map_page_attrs(pool->p.dev, page, 0,
+                                (PAGE_SIZE << pool->p.order),
+                                pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
        if (dma_mapping_error(pool->p.dev, dma)) {
                put_page(page);
                return NULL;
        }
-       set_page_private(page, dma); /* page->private = dma; */
+       page_pool_set_dma_addr(page, dma);
+
+       if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+               page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 skip_dma_map:
+       /* Track how many pages are held 'in-flight' */
+       pool->pages_state_hold_cnt++;
+
+       trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
        /* When page just alloc'ed it should/must have refcnt 1. */
        return page;
 }
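
A short driver-side sketch of how the DMA address is consumed now that the pool stores it via page_pool_set_dma_addr() rather than in page->private. The helper name is hypothetical and not part of the patch; it assumes the pool was created with PP_FLAG_DMA_MAP so the mapping already exists.

#include <net/page_pool.h>

static struct page *rxq_get_buffer(struct page_pool *pool, dma_addr_t *dma)
{
        struct page *page = page_pool_alloc_pages(pool, GFP_ATOMIC);

        if (!page)
                return NULL;

        /* address was stored by the pool via page_pool_set_dma_addr();
         * the driver adds its own headroom offset before handing it to HW
         */
        *dma = page_pool_get_dma_addr(page);
        return page;
}
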
@@ -171,23 +259,62 @@
 }
 EXPORT_SYMBOL(page_pool_alloc_pages);
 
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
-                                  struct page *page)
-{
-       if (!(pool->p.flags & PP_FLAG_DMA_MAP))
-               return;
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b)        (s32)((a) - (b))
 
-       /* DMA unmap */
-       dma_unmap_page(pool->p.dev, page_private(page),
-                      PAGE_SIZE << pool->p.order, pool->p.dma_dir);
-       set_page_private(page, 0);
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+       u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+       u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+       s32 inflight;
+
+       inflight = _distance(hold_cnt, release_cnt);
+
+       trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
+       WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+       return inflight;
 }
 
-/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+void page_pool_release_page(struct page_pool *pool, struct page *page)
 {
-       __page_pool_clean_page(pool, page);
+       dma_addr_t dma;
+       int count;
+
+       if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+               /* Always account for inflight pages, even if we didn't
+                * map them
+                */
+               goto skip_dma_unmap;
+
+       dma = page_pool_get_dma_addr(page);
+
+       /* When page is unmapped, it cannot be returned to our pool */
+       dma_unmap_page_attrs(pool->p.dev, dma,
+                            PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+                            DMA_ATTR_SKIP_CPU_SYNC);
+       page_pool_set_dma_addr(page, 0);
+skip_dma_unmap:
+       /* This may be the last page returned, releasing the pool, so
+        * it is not safe to reference pool afterwards.
+        */
+       count = atomic_inc_return(&pool->pages_state_release_cnt);
+       trace_page_pool_state_release(pool, page, count);
+}
+EXPORT_SYMBOL(page_pool_release_page);
+
+/* Return a page to the page allocator, cleaning up our state */
+static void page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+       page_pool_release_page(pool, page);
+
        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
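
The in-flight accounting added above pairs the free-running pages_state_hold_cnt with pages_state_release_cnt and relies on serial-number arithmetic: the signed difference stays correct as long as fewer than 2^31 pages separate the two u32 counters. A small self-contained example with hypothetical values, not part of the patch:

#include <linux/types.h>

#define _distance(a, b)        (s32)((a) - (b))

static s32 inflight_example(void)
{
        u32 hold_cnt = 5;               /* hold counter has wrapped past U32_MAX */
        u32 release_cnt = 4294967291u;  /* == U32_MAX - 4 */

        /* (5 - 4294967291) wraps to 10 as u32; the s32 cast keeps it +10 */
        return _distance(hold_cnt, release_cnt);        /* == 10 pages in flight */
}
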
@@ -195,8 +322,7 @@
         */
 }
 
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
-                                         struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 {
        int ret;
        /* BH protection not needed if current is serving softirq */
@@ -213,7 +339,7 @@
  *
  * Caller must provide appropriate safe context.
  */
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(struct page *page,
                                       struct page_pool *pool)
 {
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
@@ -224,8 +350,22 @@
        return true;
 }
 
-void __page_pool_put_page(struct page_pool *pool,
-                         struct page *page, bool allow_direct)
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+       return !page_is_pfmemalloc(page);
+}
+
+/* If the page refcnt == 1, this will try to recycle the page.
+ * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->p.max_len).
+ * If the page refcnt != 1, then the page will be returned to the memory
+ * subsystem.
+ */
+void page_pool_put_page(struct page_pool *pool, struct page *page,
+                       unsigned int dma_sync_size, bool allow_direct)
 {
        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but have fallbacks that act like the
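
A hedged driver-side sketch of the new page_pool_put_page() signature; the function below is hypothetical and not part of the patch. Callers running in softirq/NAPI context can pass allow_direct=true to recycle into the lockless alloc cache, and dma_sync_size only needs to cover the bytes the device may have written (typically the received frame length).

#include <net/page_pool.h>

static void rx_drop_frame(struct page_pool *pool, struct page *page,
                          unsigned int pkt_len)
{
        /* called from the driver's NAPI poll loop (softirq context) */
        page_pool_put_page(pool, page, pkt_len, true);
}
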
@@ -233,16 +373,21 @@
         *
         * refcnt == 1 means page_pool owns page, and can recycle it.
         */
-       if (likely(page_ref_count(page) == 1)) {
+       if (likely(page_ref_count(page) == 1 &&
+                  pool_page_reusable(pool, page))) {
                /* Read barrier done in page_ref_count / READ_ONCE */
 
+               if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+                       page_pool_dma_sync_for_device(pool, page,
+                                                     dma_sync_size);
+
                if (allow_direct && in_serving_softirq())
-                       if (__page_pool_recycle_direct(page, pool))
+                       if (page_pool_recycle_in_cache(page, pool))
                                return;
 
-               if (!__page_pool_recycle_into_ring(pool, page)) {
+               if (!page_pool_recycle_in_ring(pool, page)) {
                        /* Cache full, fallback to free pages */
-                       __page_pool_return_page(pool, page);
+                       page_pool_return_page(pool, page);
                }
                return;
        }
259404 * doing refcnt based recycle tricks, meaning another process
260405 * will be invoking put_page.
261406 */
262
- __page_pool_clean_page(pool, page);
407
+ /* Do not replace this with page_pool_return_page() */
408
+ page_pool_release_page(pool, page);
263409 put_page(page);
264410 }
265
-EXPORT_SYMBOL(__page_pool_put_page);
411
+EXPORT_SYMBOL(page_pool_put_page);
266412
267
-static void __page_pool_empty_ring(struct page_pool *pool)
413
+static void page_pool_empty_ring(struct page_pool *pool)
268414 {
269415 struct page *page;
270416
@@ -275,27 +421,29 @@
                pr_crit("%s() page_pool refcnt %d violation\n",
                        __func__, page_ref_count(page));
 
-               __page_pool_return_page(pool, page);
+               page_pool_return_page(pool, page);
        }
 }
 
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void page_pool_free(struct page_pool *pool)
 {
-       struct page_pool *pool;
+       if (pool->disconnect)
+               pool->disconnect(pool);
 
-       pool = container_of(rcu, struct page_pool, rcu);
-
-       WARN(pool->alloc.count, "API usage violation");
-
-       __page_pool_empty_ring(pool);
        ptr_ring_cleanup(&pool->ring, NULL);
+
+       if (pool->p.flags & PP_FLAG_DMA_MAP)
+               put_device(pool->p.dev);
+
        kfree(pool);
 }
 
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 {
        struct page *page;
+
+       if (pool->destroy_cnt)
+               return;
 
        /* Empty alloc cache, assume caller made sure this is
         * no-longer in use, and page_pool_alloc_pages() cannot be
@@ -303,15 +451,93 @@
         */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
-               __page_pool_return_page(pool, page);
+               page_pool_return_page(pool, page);
        }
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+       page_pool_empty_alloc_cache_once(pool);
+       pool->destroy_cnt++;
 
        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
-       __page_pool_empty_ring(pool);
+       page_pool_empty_ring(pool);
+}
 
-       /* An xdp_mem_allocator can still ref page_pool pointer */
-       call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+static int page_pool_release(struct page_pool *pool)
+{
+       int inflight;
+
+       page_pool_scrub(pool);
+       inflight = page_pool_inflight(pool);
+       if (!inflight)
+               page_pool_free(pool);
+
+       return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+       struct delayed_work *dwq = to_delayed_work(wq);
+       struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+       int inflight;
+
+       inflight = page_pool_release(pool);
+       if (!inflight)
+               return;
+
+       /* Periodic warning */
+       if (time_after_eq(jiffies, pool->defer_warn)) {
+               int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+               pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+                       __func__, inflight, sec);
+               pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+       }
+
+       /* Still not ready to be disconnected, retry later */
+       schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+       refcount_inc(&pool->user_cnt);
+       pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+       if (!pool)
+               return;
+
+       if (!page_pool_put(pool))
+               return;
+
+       if (!page_pool_release(pool))
+               return;
+
+       pool->defer_start = jiffies;
+       pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+       INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+       schedule_delayed_work(&pool->release_dw, DEFER_TIME);
 }
 EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+       struct page *page;
+
+       trace_page_pool_update_nid(pool, new_nid);
+       pool->p.nid = new_nid;
+
+       /* Flush pool alloc cache, as refill will check NUMA node */
+       while (pool->alloc.count) {
+               page = pool->alloc.cache[--pool->alloc.count];
+               page_pool_return_page(pool, page);
+       }
+}
+EXPORT_SYMBOL(page_pool_update_nid);
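
Finally, a driver-side sketch of the shutdown and NUMA-update entry points added above. 'struct my_rxq' and both helpers are hypothetical, not part of the patch. page_pool_destroy() tolerates a NULL pool and, when pages are still in flight, defers the final free to the delayed work that retries every DEFER_TIME and warns at most once per DEFER_WARN_INTERVAL.

#include <linux/topology.h>     /* cpu_to_node() */
#include <net/page_pool.h>

struct my_rxq {
        struct page_pool *page_pool;
        /* ... other driver state ... */
};

static void my_rxq_teardown(struct my_rxq *rxq)
{
        /* RX ring already drained; any remaining pages stay "in flight"
         * and are handled by the deferred release work
         */
        page_pool_destroy(rxq->page_pool);
        rxq->page_pool = NULL;
}

static void my_rxq_affinity_changed(struct my_rxq *rxq, int new_cpu)
{
        /* runs in NAPI context; subsequent refills prefer the new node */
        page_pool_update_nid(rxq->page_pool, cpu_to_node(new_cpu));
}
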