page_pool: check for DMA sync shortcut earlier
net/core/page_pool.c [linux-2.6-block.git]

/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)						\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
		this_cpu_inc(s->__stat);					\
	} while (0)

#define recycle_stat_add(pool, __stat, val)					\
	do {									\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats;	\
		this_cpu_add(s->__stat, val);					\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which this API fills in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
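
/* Illustrative usage sketch (not part of the original file): a driver exposing
 * these counters via ethtool might combine the helpers above as follows, with
 * page_pool_ethtool_stats_get_count() feeding its ->get_sset_count() callback.
 * The my_* names and the priv->rx_pool pointer are hypothetical.
 *
 *	static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
 *	{
 *		if (sset == ETH_SS_STATS)
 *			data = page_pool_ethtool_stats_get_strings(data);
 *	}
 *
 *	static void my_get_ethtool_stats(struct net_device *dev,
 *					 struct ethtool_stats *e, u64 *data)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats stats = {};
 *
 *		if (page_pool_get_stats(priv->rx_pool, &stats))
 *			data = page_pool_ethtool_stats_get(data, &stats);
 *	}
 */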

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	page_pool_struct_check();

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->slow.flags & ~PP_FLAG_ALL)
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to be used for DMA sending,
	 * which is the XDP_TX use-case.
	 */
	if (pool->slow.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;

		pool->dma_map = true;
	}

	if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		pool->dma_sync = true;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instances we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
		pool->system = true;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!pool->system)
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->dma_map)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->dma_map)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!pool->system)
		free_percpu(pool->recycle_stats);
#endif
}

/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
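
/* Illustrative usage sketch (not part of the original file): an RX setup path
 * might create a pool that satisfies the checks in page_pool_init() above
 * roughly like this; the device pointer, ring size and RX_HEADROOM value are
 * hypothetical example values.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= 1024,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= RX_HEADROOM,
 *		.max_len	= PAGE_SIZE - RX_HEADROOM,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */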

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees the CPU and thus the NUMA node are stable. This
	 * assumes the CPU refilling the driver RX-ring will also run RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
					    struct page *page,
					    u32 dma_sync_size)
{
#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
				     dma_sync_size, pool->p.dma_dir);
#endif
}
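
/* Shortcut check: sync-for-device is skipped entirely when the pool was not
 * created with PP_FLAG_DMA_SYNC_DEV (pool->dma_sync) or when the DMA layer
 * reports that the device never needs syncs (dma_dev_need_sync()); only the
 * remaining cases take the out-of-line helper above.
 */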
static __always_inline void
page_pool_dma_sync_for_device(const struct page_pool *pool,
			      struct page *page,
			      u32 dma_sync_size)
{
	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for lifetime of page, until leaving pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_order = pool->p.order;
	bool dma_map = pool->dma_map;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not yet been DMA mapped (if needed).
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if (dma_map && unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* Replacement for alloc_pages() API calls when using page_pool, while
 * providing a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
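
/* Illustrative usage sketch (not part of the original file): a typical RX
 * refill loop allocates from the pool and programs the (already mapped) DMA
 * address into a descriptor; rx_headroom and my_hw_write_rx_desc() are
 * hypothetical.
 *
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *	if (!page)
 *		break;
 *	dma = page_pool_get_dma_addr(page) + rx_headroom;
 *	my_hw_write_rx_desc(ring, dma);
 */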

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}

static __always_inline
void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	if (!pool->dma_map)
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	int count;

	__page_pool_release_page_dma(pool, page);

	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

static bool __page_pool_page_can_be_recycled(const struct page *page)
{
	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If pool->dma_sync is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * A page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc).
	 */
	if (likely(__page_pool_page_can_be_recycled(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		page_pool_dma_sync_for_device(pool, page, dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
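
/* Illustrative usage sketch (not part of the original file): drivers normally
 * reach this path through the helpers in <net/page_pool/helpers.h>, e.g. on an
 * RX error the buffer can be recycled with the full sync size; allow_direct
 * may only be true when called from this pool's NAPI/softirq context:
 *
 *	page_pool_put_full_page(pool, page, true);
 */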

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache holding the
 * ptr_ring producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
 * will release leftover pages to the page allocator.
 * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
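
/* Illustrative usage sketch (not part of the original file): an XDP_REDIRECT
 * Tx completion handler can batch buffer pointers that all belong to the same
 * pool and flush them in one call; BATCH, buffers[] and
 * my_pull_completed_buffer() are hypothetical.
 *
 *	void *buffers[BATCH];
 *	int n = 0;
 *
 *	while (n < BATCH && (va = my_pull_completed_buffer(ring)))
 *		buffers[n++] = va;
 *	page_pool_put_page_bulk(pool, buffers, n);
 */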

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (__page_pool_page_can_be_recycled(page)) {
		page_pool_dma_sync_for_device(pool, page, -1);
		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
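
/* Illustrative usage sketch (not part of the original file): a driver packing
 * several RX buffers into one page can use the frag API; the 2048-byte buffer
 * size is a hypothetical example.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *	dma = page_pool_get_dma_addr(page) + offset;
 */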

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);