/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
 * Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
#include <linux/poison.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* Drivers calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
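
/* Usage sketch (hypothetical driver code, not part of this file): a
 * driver would typically create one pool per RX-queue. The field values
 * below are illustrative assumptions, not requirements.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.order		= 0,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 */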

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Softirq guarantees the CPU, and thus the NUMA node, is stable.
	 * This assumes the CPU refilling the driver RX-ring also runs RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Slower-path: Get pages from locked ring queue */
	spin_lock(&r->consumer_lock);

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to the page-allocator and
			 * (2) break out, falling through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0))
		page = pool->alloc.cache[--pool->alloc.count];

	spin_unlock(&r->consumer_lock);
	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always
	 * fit into the page private data (i.e. a 32-bit CPU with 64-bit
	 * DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	page->pp_magic |= PP_SIGNATURE;

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array(gfp, bulk, pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count
	 * is zero and the pages have not yet been DMA mapped (when
	 * requested via PP_FLAG_DMA_MAP).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}
		page->pp_magic |= PP_SIGNATURE;
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0))
		page = pool->alloc.cache[--pool->alloc.count];
	else
		page = NULL;

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For drivers using page_pool to replace alloc_pages() API calls; this
 * provides the synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
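
/* Usage sketch (hypothetical, not part of this file): an RX-ring refill
 * loop running in NAPI/softirq context could look like the following;
 * 'rxd' and 'pp' are assumed driver-local names.
 *
 *	struct page *page = page_pool_dev_alloc_pages(pp);
 *
 *	if (!page)
 *		break;
 *	rxd->dma_addr = page_pool_get_dma_addr(page) + pp->p.offset;
 *
 * page_pool_dev_alloc_pages() is the GFP_ATOMIC convenience wrapper
 * declared in <net/page_pool.h>.
 */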

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

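/* Worked example of the serial-number arithmetic above: after a u32
 * wrap-around, hold_cnt = 0x00000002 and release_cnt = 0xfffffffe give
 * _distance(hold_cnt, release_cnt) = (s32)(0x00000002 - 0xfffffffe) = 4,
 * i.e. four pages are still in-flight even though hold_cnt is
 * numerically smaller.
 */
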
/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page, to allow it to be used as a regular page
 * (that will eventually be returned to the normal page-allocator via
 * put_page).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page->pp_magic = 0;

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);
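
/* Usage sketch (hypothetical, not part of this file): a driver that
 * hands a page up the stack without recycling it releases it first, so
 * the DMA mapping and inflight accounting are undone; 'rxq' is an
 * assumed driver-local name.
 *
 *	page_pool_release_page(rxq->page_pool, page);
 *	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
 */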

/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_serving_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API users have an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);

	return NULL;
}

void page_pool_put_page(struct page_pool *pool, struct page *page,
			unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_page);
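
/* Usage sketch (hypothetical, not part of this file): recycling from an
 * XDP_DROP path inside RX-NAPI may use the direct (lockless) variant;
 * wrappers such as page_pool_put_full_page() and
 * page_pool_recycle_direct() from <net/page_pool.h> fill in the
 * dma_sync_size/allow_direct arguments.
 *
 *	case XDP_DROP:
 *		page_pool_recycle_direct(rxq->page_pool, page);
 *		break;
 */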

/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	page_pool_ring_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i]))
			break; /* ring full */
	}
	page_pool_ring_unlock(pool);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
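
/* Usage note (sketch, not part of this file): this bulk API is driven
 * by the XDP layer; xdp_return_frame_bulk() batches frames into a
 * struct xdp_frame_bulk, and xdp_flush_frame_bulk() hands the batch to
 * page_pool_put_page_bulk() so the producer lock is taken once per
 * batch:
 *
 *	struct xdp_frame_bulk bq;
 *
 *	xdp_frame_bulk_init(&bq);
 *	xdp_return_frame_bulk(xdpf, &bq);
 *	...
 *	xdp_flush_frame_bulk(&bq);
 */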

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty the alloc cache; assume the caller made sure this pool is
	 * no longer in use, and that page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
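
/* Usage note (sketch, not part of this file): page_pool_destroy() only
 * drops a user reference; the pool is freed once all inflight pages
 * have been returned. A driver should therefore return every page it
 * still holds before destroying the pool, or the retry work above will
 * warn periodically, e.g.:
 *
 *	for (i = 0; i < ring_size; i++)
 *		page_pool_put_full_page(pp, rx_pages[i], false);
 *	page_pool_destroy(pp);
 */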

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);

bool page_pool_return_skb_page(struct page *page)
{
	struct page_pool *pp;

	page = compound_head(page);
	if (unlikely(page->pp_magic != PP_SIGNATURE))
		return false;

	pp = page->pp;

	/* The driver sets this to its memory recycling info. Reset it on
	 * recycle. This will *not* work for NICs using a split-page memory
	 * model. The page will be returned to the pool here regardless of
	 * the 'flipped' fragment being in use or not.
	 */
	page->pp = NULL;
	page_pool_put_full_page(pp, page, false);

	return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);
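
/* Usage note (sketch, not part of this file): drivers opt in to this
 * path with skb_mark_for_recycle(), which sets skb->pp_recycle and
 * records the owning pool in page->pp; the skb free path then calls
 * page_pool_return_skb_page() instead of putting the page directly.
 */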