/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* Driver calling page_pool_create() also calls page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err = 0;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
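
/* Usage sketch (illustrative, not part of this file): a driver would
 * typically fill a struct page_pool_params and create one pool per RX
 * queue. The field values and the pdev variable below are hypothetical
 * examples, not requirements.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.order		= 0,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */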

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		/* Slower-path: alloc array empty, time to refill.
		 *
		 * Open-coded bulk ptr_ring consumer.
		 *
		 * Discussion: the ring consumer lock is not strictly
		 * needed due to the softirq/NAPI protection, but we
		 * later need the ability to reclaim pages on the
		 * ring. Thus, keep the locks.
		 */
		spin_lock(&r->consumer_lock);
		while ((page = __ptr_ring_consume(r))) {
			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
				break;
			pool->alloc.cache[pool->alloc.count++] = page;
		}
		spin_unlock(&r->consumer_lock);
		return page;
	}

	/* Slow-path: get page from locked ring queue */
	page = ptr_ring_consume(&pool->ring);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * Current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance. This code
	 * needs bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use the 'struct page' area for storing the
	 * DMA address, since dma_addr_t can be either 32 or 64 bits and
	 * does not always fit into page private data (i.e. a 32bit cpu
	 * with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, while providing
 * a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
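
/* Allocation-side usage sketch (illustrative): called from the driver's
 * NAPI poll / RX-refill routine, i.e. from a single softirq context per
 * pool, which is the synchronization guarantee this API assumes. The
 * helper name and GFP flags below are hypothetical examples.
 *
 *	static struct page *myvnic_rx_alloc(struct page_pool *pool,
 *					    dma_addr_t *dma)
 *	{
 *		struct page *page;
 *
 *		page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *		if (!page)
 *			return NULL;
 *
 *		// With PP_FLAG_DMA_MAP, the mapping created in the
 *		// slow-path above is stashed in page->dma_addr for the
 *		// page's lifetime in the pool.
 *		*dma = page->dma_addr;
 *		return page;
 *	}
 */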

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))
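/* Worked example of the wrap-around safety (illustrative): with
 * hold_cnt = 0x00000001 and release_cnt = 0xfffffffe, the u32
 * subtraction yields 0x00000003, so _distance() reports 3 pages
 * in-flight even though hold_cnt has wrapped past zero. The result
 * is only meaningful while the true distance stays below 2^31.
 */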

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 distance;

	distance = _distance(hold_cnt, release_cnt);

	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
	return distance;
}

static bool __page_pool_safe_to_destroy(struct page_pool *pool)
{
	s32 inflight = page_pool_inflight(pool);

	/* The distance should not be able to become negative */
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return (inflight == 0);
}

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_unmap;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
skip_dma_unmap:
	atomic_inc(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page,
				      atomic_read(&pool->pages_state_release_cnt));
}

/* Unmap the page and clean our state */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
	/* When a page is unmapped, this implies the page will not be
	 * returned to the page_pool.
	 */
	__page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);
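
/* Usage sketch (illustrative): when a driver hands a DMA-mapped page up
 * the network stack (e.g. as an skb fragment) instead of recycling it,
 * the page leaves the pool's control, so the mapping must be dropped
 * first; the stack's eventual put_page() then frees the page normally.
 * The function name below is a hypothetical example.
 *
 *	static void myvnic_give_page_to_stack(struct page_pool *pool,
 *					      struct sk_buff *skb,
 *					      struct page *page,
 *					      unsigned int len)
 *	{
 *		page_pool_unmap_page(pool, page);
 *		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
 *				0, len, PAGE_SIZE);
 *	}
 */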

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;

	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but it has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
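
/* Recycling-side usage sketch (illustrative): from the NAPI poll loop a
 * driver handling XDP_DROP can return the page straight into the alloc
 * cache, since it runs in the safe (softirq) context and knows the page
 * refcnt is 1. The function name below is a hypothetical example.
 *
 *	static void myvnic_xdp_drop(struct page_pool *pool, struct page *page)
 *	{
 *		// allow_direct=true: recycle into pool->alloc.cache;
 *		// outside NAPI context a driver would pass false, and the
 *		// page would go through the ptr_ring (or be freed).
 *		__page_pool_put_page(pool, page, true);
 *	}
 */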

static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void __warn_in_flight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 distance;

	distance = _distance(hold_cnt, release_cnt);

	/* Drivers should fix this, but it is only problematic when DMA is used */
	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
	     distance, hold_cnt, release_cnt);
}

void __page_pool_free(struct page_pool *pool)
{
	/* Only the last user actually frees/releases resources */
	if (!page_pool_put(pool))
		return;

	WARN(pool->alloc.count, "API usage violation");
	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");

	/* Can happen due to forced shutdown */
	if (!__page_pool_safe_to_destroy(pool))
		__warn_in_flight(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}
EXPORT_SYMBOL(__page_pool_free);

/* Request to shutdown: release pages cached by the page_pool, and check
 * for in-flight pages
 */
bool __page_pool_request_shutdown(struct page_pool *pool)
{
	struct page *page;

	/* Empty alloc cache, assuming the caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);

	return __page_pool_safe_to_destroy(pool);
}
EXPORT_SYMBOL(__page_pool_request_shutdown);
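
/* Teardown sketch (illustrative): a shutdown path would stop the
 * allocation side (RX/NAPI) first, then ask the pool whether every
 * in-flight page has been returned before releasing it. The function
 * name, retry loop and delay below are hypothetical examples, not the
 * in-tree shutdown sequence.
 *
 *	static void myvnic_destroy_pool(struct page_pool *pool)
 *	{
 *		// RX/NAPI must already be disabled at this point.
 *		while (!__page_pool_request_shutdown(pool))
 *			msleep(100);	// wait for in-flight pages
 *
 *		__page_pool_free(pool);
 *	}
 */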