/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	LONG_MAX

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

bool page_pool_get_stats(struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
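
/* Example (not part of this file): a hedged sketch of how a driver's
 * ethtool callbacks might wire up the three helpers above.  The driver
 * struct, the "my_pool" member and the callback names are hypothetical
 * placeholders, not taken from this file.
 *
 *	static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int mydrv_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return (sset == ETH_SS_STATS) ?
 *			page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void mydrv_get_ethtool_stats(struct net_device *dev,
 *					    struct ethtool_stats *e, u64 *data)
 *	{
 *		struct mydrv_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = { };
 *
 *		if (page_pool_get_stats(priv->my_pool, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */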

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmission, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
	    pool->p.flags & PP_FLAG_PAGE_FRAG)
		return -EINVAL;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
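
/* Example (not part of this file): a minimal sketch of how an RX queue
 * might create its pool.  The "pdev"/"rxq" names and the "page_pool"
 * member are hypothetical driver-side names.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 *	rxq->page_pool = pp;
 */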

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees that the CPU, and thus the NUMA node, is
	 * stable.  This assumes the CPU refilling the driver RX-ring
	 * also runs the RX-NAPI loop.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only on NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use the 'struct page' area for storing the
	 * DMA address, since dma_addr_t can be either 32 or 64 bits and
	 * does not always fit into the page private data (i.e. a 32-bit
	 * CPU with 64-bit DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	page_pool_set_dma_addr(page, dma);

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;
	if (pool->p.init_callback)
		pool->p.init_callback(page, pool->p.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count
	 * is zero and the pages have not yet been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* Use page_pool to replace alloc_pages() API calls, but provide a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
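
/* Example (not part of this file): a hedged sketch of a driver RX refill
 * loop using page_pool_alloc_pages() together with the DMA address stored
 * by page_pool_dma_map() when PP_FLAG_DMA_MAP is set.  The ring and
 * descriptor helper names are hypothetical.
 *
 *	while (rx_ring_has_room(rxq)) {
 *		struct page *page;
 *
 *		page = page_pool_alloc_pages(rxq->page_pool,
 *					     GFP_ATOMIC | __GFP_NOWARN);
 *		if (unlikely(!page))
 *			break;
 *
 *		// With PP_FLAG_DMA_MAP the pool already mapped the page
 *		rx_desc_set_addr(rxq, page_pool_get_dma_addr(page) +
 *				      rxq->page_pool->p.offset);
 *	}
 */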

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))
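
/* For instance, after hold_cnt has wrapped past zero while release_cnt has
 * not, _distance(5, 0xfffffffbU) still evaluates to 10: the u32 subtraction
 * wraps modulo 2^32 and the cast to s32 keeps the small positive difference.
 */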

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Disconnects a page (from a page_pool).  API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_release_page(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		goto skip_dma_unmap;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}
EXPORT_SYMBOL(page_pool_release_page);

/* Return a page to the page allocator, cleaning up our state */
static void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	page_pool_release_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_serving_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API users have an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling.  Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once.  In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	/* Do not replace this with page_pool_return_page() */
	page_pool_release_page(pool, page);
	put_page(page);

	return NULL;
}

void page_pool_put_defragged_page(struct page_pool *pool, struct page *page,
				  unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_defragged_page);
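
/* Example (not part of this file): driver-side recycling normally goes
 * through the page_pool_put_full_page()/page_pool_put_page() wrappers in
 * <net/page_pool.h>, which handle the frag count and then call the
 * function above.  A hedged sketch for an XDP_DROP fast path (the "rxq"
 * name is hypothetical):
 *
 *	case XDP_DROP:
 *		// allow_direct = true: we are in RX-NAPI (softirq) context
 *		page_pool_put_full_page(rxq->page_pool, page, true);
 *		break;
 */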

/* Caller must not use data area after call, as this function overwrites it */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_frag(pool, page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	page_pool_ring_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_ring_unlock(pool);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
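
/* Example (not part of this file): the main in-tree consumer of the bulk
 * API is xdp_return_frame_bulk() in net/core/xdp.c, which batches frames
 * freed during TX completion.  A hedged sketch of a completion loop using
 * that facility (the "cq"/"pull_completed_frame" names are hypothetical):
 *
 *	struct xdp_frame_bulk bq;
 *	struct xdp_frame *xdpf;
 *
 *	xdp_frame_bulk_init(&bq);
 *	while ((xdpf = pull_completed_frame(cq)))
 *		xdp_return_frame_bulk(xdpf, &bq);
 *	xdp_flush_frame_bulk(&bq);
 */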

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_defrag_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_defrag_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
		    size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
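
/* Example (not part of this file): a hedged sketch of the frag API for a
 * driver that packs several RX buffers into one page, on a pool created
 * with PP_FLAG_PAGE_FRAG.  The buffer-size constant and ring names are
 * hypothetical.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    MYDRV_RX_BUF_SIZE, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *
 *	// The fragment starts at page_address(page) + offset; with
 *	// PP_FLAG_DMA_MAP the DMA address of the fragment is
 *	// page_pool_get_dma_addr(page) + offset.
 */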

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
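
/* Example (not part of this file): a hedged sketch of the teardown order a
 * driver typically follows.  In-flight pages keep the pool alive via the
 * deferred-release work above, so destroy is safe even before all packets
 * have been freed.  The "rxq" names are hypothetical.
 *
 *	// Stop new allocations first (NAPI disabled, RX ring torn down).
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);	// drops the XDP mem-model's reference
 *	page_pool_destroy(rxq->page_pool);	// drops the driver's reference;
 *						// freed once inflight == 0
 *	rxq->page_pool = NULL;
 */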

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);

bool page_pool_return_skb_page(struct page *page)
{
	struct page_pool *pp;

	page = compound_head(page);

	/* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
	 * in order to preserve any existing bits, such as bit 0 for the
	 * head page of a compound page and bit 1 for a pfmemalloc page, so
	 * mask those bits on the freeing side when doing the check below,
	 * and page_is_pfmemalloc() is checked in __page_pool_put_page()
	 * to avoid recycling the pfmemalloc page.
	 */
	if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
		return false;

	pp = page->pp;

	/* The driver set this to the memory recycling info.  Reset it on
	 * recycle.  This will *not* work for a NIC using a split-page
	 * memory model.  The page will be returned to the pool here
	 * regardless of whether the 'flipped' fragment is in use or not.
	 */
	page_pool_put_full_page(pp, page, false);

	return true;
}
EXPORT_SYMBOL(page_pool_return_skb_page);