page_pool: disable direct recycling based on pool->cpuid on destroy
net/core/page_pool.c
/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)	\
	do {	\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);	\
	} while (0)

#define recycle_stat_add(pool, __stat, val)	\
	do {	\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);	\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which this API fills in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
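
/* Illustrative sketch (editorial addition, not part of the original file):
 * how a hypothetical driver might wire these helpers into its ethtool stats
 * callbacks. The "mydrv" names and the per-queue pool array are assumptions
 * for the example only; the page_pool calls are the ones defined above.
 *
 *	static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static int mydrv_get_sset_count(struct net_device *dev, int sset)
 *	{
 *		return sset == ETH_SS_STATS ?
 *		       page_pool_ethtool_stats_get_count() : -EOPNOTSUPP;
 *	}
 *
 *	static void mydrv_get_ethtool_stats(struct net_device *dev,
 *					    struct ethtool_stats *stats, u64 *data)
 *	{
 *		struct mydrv_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = { };
 *		int i;
 *
 *		for (i = 0; i < priv->num_rx_queues; i++)
 *			page_pool_get_stats(priv->rxq[i].page_pool, &pp_stats);
 *		data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 */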

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows a page to also be used for DMA sending,
	 * which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
	if (!pool->recycle_stats)
		return -ENOMEM;
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* The driver calling page_pool_create() also calls page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	free_percpu(pool->recycle_stats);
#endif
}

/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
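
/* Illustrative sketch (editorial addition, not part of the original file):
 * typical parameters a hypothetical driver could use when creating a pool
 * for an RX ring. The "rxq" structure and its fields are assumptions for
 * the example; the page_pool_params fields and flags are the ones validated
 * in page_pool_init() above.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= rxq->ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.napi		= &rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *	};
 *
 *	rxq->page_pool = page_pool_create(&pp_params);
 *	if (IS_ERR(rxq->page_pool)) {
 *		err = PTR_ERR(rxq->page_pool);
 *		rxq->page_pool = NULL;
 *		return err;
 *	}
 */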

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU and thus the NUMA node is stable. This
	 * assumes the CPU refilling the driver RX-ring will also run RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fallthrough to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for lifetime of page, until leaving pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not (possibly) been DMA mapped yet.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that has just been alloc'ed should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, but providing a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
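
/* Illustrative sketch (editorial addition, not part of the original file):
 * a hypothetical RX refill loop built on page_pool_alloc_pages(). The
 * descriptor layout and the "rxq" fields are assumptions for the example;
 * page_pool_get_dma_addr() is the helper to use for pools created with
 * PP_FLAG_DMA_MAP.
 *
 *	while (rxq->fill_cnt < rxq->ring_size) {
 *		struct page *page;
 *
 *		page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *		if (!page)
 *			break;
 *
 *		rxq->desc[rxq->fill_idx].addr =
 *			page_pool_get_dma_addr(page) + rxq->headroom;
 *		rxq->page[rxq->fill_idx] = page;
 *		rxq->fill_idx = (rxq->fill_idx + 1) & (rxq->ring_size - 1);
 *		rxq->fill_cnt++;
 *	}
 */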

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}
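
/* Worked example (editorial note, not in the original file): the _distance()
 * cast keeps the inflight count correct across u32 wraparound. E.g. with
 * hold_cnt = 5 after wrapping and release_cnt = 0xfffffffb (5 short of the
 * wrap), (s32)(5 - 0xfffffffb) == 10, i.e. ten pages are still in flight,
 * even though hold_cnt is numerically smaller than release_cnt.
 */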

static __always_inline
void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	int count;

	__page_pool_release_page_dma(pool, page);

	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure. (page_is_pfmemalloc)
	 */
	if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In-case page_pool maintains the DMA mapping, API user must
	 * call page_pool_put_page once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Recycle ring full, fall back to freeing the page */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache while holding
 * the ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
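
/* Illustrative sketch (editorial addition, not part of the original file):
 * the in-tree consumer of this API is xdp_return_frame_bulk() in
 * net/core/xdp.c, which batches frames per page_pool in a struct
 * xdp_frame_bulk. A hypothetical direct use from a TX completion loop could
 * look like the following; BULK_MAX and the "txq"/"mydrv" helpers are
 * assumptions for the example, and all entries passed in one call must come
 * from the same pool.
 *
 *	void *frames[BULK_MAX];
 *	void *data;
 *	int nr = 0;
 *
 *	while ((data = mydrv_next_completed_xdp_frame(txq))) {
 *		frames[nr++] = data;
 *		if (nr == BULK_MAX) {
 *			page_pool_put_page_bulk(txq->page_pool, frames, nr);
 *			nr = 0;
 *		}
 *	}
 *	if (nr)
 *		page_pool_put_page_bulk(txq->page_pool, frames, nr);
 */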

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
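
/* Illustrative sketch (editorial addition, not part of the original file):
 * a hypothetical driver filling an RX ring with sub-page buffers via the
 * frag API. The buffer size and "rxq" fields are assumptions for the
 * example; offset comes back from page_pool_alloc_frag(), and the DMA
 * address of the fragment is the page's DMA address plus that offset.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(rxq->page_pool, &offset,
 *				    rxq->rx_buf_size, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *
 *	rxq->desc[idx].addr = page_pool_get_dma_addr(page) + offset;
 *	rxq->buf[idx].page = page;
 *	rxq->buf[idx].offset = offset;
 */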

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
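
/* Illustrative sketch (editorial addition, not part of the original file):
 * typical teardown order in a hypothetical driver. NAPI is disabled first so
 * the checks in page_pool_disable_direct_recycling() hold, the XDP RX queue
 * info is unregistered, and only then is the pool destroyed (which may defer
 * the final free until all in-flight pages have been returned).
 *
 *	napi_disable(&rxq->napi);
 *	xdp_rxq_info_unreg(&rxq->xdp_rxq);
 *	page_pool_destroy(rxq->page_pool);
 *	rxq->page_pool = NULL;
 */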

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);