page_pool: make sure frag API fields don't span between cachelines
[linux-2.6-block.git] / net / core / page_pool.c
/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool/helpers.h>
#include <net/xdp.h>

#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for put_page() */
#include <linux/poison.h>
#include <linux/ethtool.h>
#include <linux/netdevice.h>

#include <trace/events/page_pool.h>

#include "page_pool_priv.h"

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

#define BIAS_MAX	(LONG_MAX >> 1)

#ifdef CONFIG_PAGE_POOL_STATS
static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);

/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
/* recycle_stat_inc is safe to use when preemption is possible. */
#define recycle_stat_inc(pool, __stat)					\
	do {								\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_inc(s->__stat);				\
	} while (0)

#define recycle_stat_add(pool, __stat, val)				\
	do {								\
		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
		this_cpu_add(s->__stat, val);				\
	} while (0)

static const char pp_stats[][ETH_GSTRING_LEN] = {
	"rx_pp_alloc_fast",
	"rx_pp_alloc_slow",
	"rx_pp_alloc_slow_ho",
	"rx_pp_alloc_empty",
	"rx_pp_alloc_refill",
	"rx_pp_alloc_waive",
	"rx_pp_recycle_cached",
	"rx_pp_recycle_cache_full",
	"rx_pp_recycle_ring",
	"rx_pp_recycle_ring_full",
	"rx_pp_recycle_released_ref",
};

/**
 * page_pool_get_stats() - fetch page pool stats
 * @pool: pool from which page was allocated
 * @stats: struct page_pool_stats to fill in
 *
 * Retrieve statistics about the page_pool. This API is only available
 * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``.
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which is filled in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
 */
bool page_pool_get_stats(const struct page_pool *pool,
			 struct page_pool_stats *stats)
{
	int cpu = 0;

	if (!stats)
		return false;

	/* The caller is responsible for initializing stats. */
	stats->alloc_stats.fast += pool->alloc_stats.fast;
	stats->alloc_stats.slow += pool->alloc_stats.slow;
	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
	stats->alloc_stats.empty += pool->alloc_stats.empty;
	stats->alloc_stats.refill += pool->alloc_stats.refill;
	stats->alloc_stats.waive += pool->alloc_stats.waive;

	for_each_possible_cpu(cpu) {
		const struct page_pool_recycle_stats *pcpu =
			per_cpu_ptr(pool->recycle_stats, cpu);

		stats->recycle_stats.cached += pcpu->cached;
		stats->recycle_stats.cache_full += pcpu->cache_full;
		stats->recycle_stats.ring += pcpu->ring;
		stats->recycle_stats.ring_full += pcpu->ring_full;
		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
	}

	return true;
}
EXPORT_SYMBOL(page_pool_get_stats);
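
/* Illustrative sketch (not part of this file): a driver with a hypothetical
 * per-queue "struct my_rx_queue { struct page_pool *page_pool; ... }" might
 * aggregate pool stats for its ethtool callbacks roughly like below. The
 * priv/rxq names are assumptions for the example only; note that @stats must
 * be zero-initialized because page_pool_get_stats() accumulates into it.
 *
 *	struct page_pool_stats stats = { };
 *	int i;
 *
 *	for (i = 0; i < priv->num_rx_queues; i++)
 *		page_pool_get_stats(priv->rxq[i].page_pool, &stats);
 *
 *	// Append the accumulated counters to the ethtool data array,
 *	// after the driver's own counters.
 *	data = page_pool_ethtool_stats_get(data, &stats);
 */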

u8 *page_pool_ethtool_stats_get_strings(u8 *data)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
		data += ETH_GSTRING_LEN;
	}

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);

int page_pool_ethtool_stats_get_count(void)
{
	return ARRAY_SIZE(pp_stats);
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);

u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
{
	struct page_pool_stats *pool_stats = stats;

	*data++ = pool_stats->alloc_stats.fast;
	*data++ = pool_stats->alloc_stats.slow;
	*data++ = pool_stats->alloc_stats.slow_high_order;
	*data++ = pool_stats->alloc_stats.empty;
	*data++ = pool_stats->alloc_stats.refill;
	*data++ = pool_stats->alloc_stats.waive;
	*data++ = pool_stats->recycle_stats.cached;
	*data++ = pool_stats->recycle_stats.cache_full;
	*data++ = pool_stats->recycle_stats.ring;
	*data++ = pool_stats->recycle_stats.ring_full;
	*data++ = pool_stats->recycle_stats.released_refcnt;

	return data;
}
EXPORT_SYMBOL(page_pool_ethtool_stats_get);

#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
#define recycle_stat_add(pool, __stat, val)
#endif

static bool page_pool_producer_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
{
	bool in_softirq = in_softirq();

	if (in_softirq)
		spin_lock(&pool->ring.producer_lock);
	else
		spin_lock_bh(&pool->ring.producer_lock);

	return in_softirq;
}

static void page_pool_producer_unlock(struct page_pool *pool,
				      bool in_softirq)
	__releases(&pool->ring.producer_lock)
{
	if (in_softirq)
		spin_unlock(&pool->ring.producer_lock);
	else
		spin_unlock_bh(&pool->ring.producer_lock);
}

static void page_pool_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
	CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
	CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, 4 * sizeof(long));
}

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params,
			  int cpuid)
{
	unsigned int ring_qsize = 1024; /* Default */

	page_pool_struct_check();

	memcpy(&pool->p, &params->fast, sizeof(pool->p));
	memcpy(&pool->slow, &params->slow, sizeof(pool->slow));

	pool->cpuid = cpuid;

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * sending, which is the XDP_TX use-case.
	 */
	if (pool->p.flags & PP_FLAG_DMA_MAP) {
		if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
		    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
			return -EINVAL;
	}

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	pool->has_init_callback = !!pool->slow.init_callback;

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) {
		pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
		if (!pool->recycle_stats)
			return -ENOMEM;
	} else {
		/* For system page pool instance we use a singular stats object
		 * instead of allocating a separate percpu variable for each
		 * (also percpu) page pool instance.
		 */
		pool->recycle_stats = &pp_system_recycle_stats;
	}
#endif

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
#ifdef CONFIG_PAGE_POOL_STATS
		if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
			free_percpu(pool->recycle_stats);
#endif
		return -ENOMEM;
	}

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* A driver calling page_pool_create() must also call page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

static void page_pool_uninit(struct page_pool *pool)
{
	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

#ifdef CONFIG_PAGE_POOL_STATS
	if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL))
		free_percpu(pool->recycle_stats);
#endif
}

/**
 * page_pool_create_percpu() - create a page pool for a given cpu.
 * @params: parameters, see struct page_pool_params
 * @cpuid: cpu identifier
 */
struct page_pool *
page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params, cpuid);
	if (err < 0)
		goto err_free;

	err = page_pool_list(pool);
	if (err)
		goto err_uninit;

	return pool;

err_uninit:
	page_pool_uninit(pool);
err_free:
	pr_warn("%s() gave up with errno %d\n", __func__, err);
	kfree(pool);
	return ERR_PTR(err);
}
EXPORT_SYMBOL(page_pool_create_percpu);

/**
 * page_pool_create() - create a page pool
 * @params: parameters, see struct page_pool_params
 */
struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
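
/* Illustrative sketch (not part of this file): typical driver-side setup of
 * a pool for one RX ring, letting the pool handle DMA mapping and
 * dma-sync-for-device. The priv/napi/ring_size names are assumptions; a
 * driver doing XDP_TX would pick DMA_BIDIRECTIONAL instead of
 * DMA_FROM_DEVICE.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,
 *		.napi		= &priv->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.offset		= XDP_PACKET_HEADROOM,
 *		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
 *	};
 *	struct page_pool *pool;
 *
 *	pool = page_pool_create(&pp_params);
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 */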

static void page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	struct page *page;
	int pref_nid; /* preferred NUMA node */

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r)) {
		alloc_stat_inc(pool, empty);
		return NULL;
	}

	/* Softirq guarantees CPU and thus NUMA node is stable. This,
	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
	 */
#ifdef CONFIG_NUMA
	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
#else
	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
#endif

	/* Refill alloc array, but only if NUMA match */
	do {
		page = __ptr_ring_consume(r);
		if (unlikely(!page))
			break;

		if (likely(page_to_nid(page) == pref_nid)) {
			pool->alloc.cache[pool->alloc.count++] = page;
		} else {
			/* NUMA mismatch;
			 * (1) release 1 page to page-allocator and
			 * (2) break out to fallthrough to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
			page_pool_return_page(pool, page);
			alloc_stat_inc(pool, waive);
			page = NULL;
			break;
		}
	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, refill);
	}

	return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct page *page;

	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
	if (likely(pool->alloc.count)) {
		/* Fast-path */
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, fast);
	} else {
		page = page_pool_refill_alloc_cache(pool);
	}

	return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_addr_t dma_addr = page_pool_get_dma_addr(page);

	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. 32bit cpu with 64bit DMA caps)
	 * This mapping is kept for lifetime of page, until leaving pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC |
						  DMA_ATTR_WEAK_ORDERING);
	if (dma_mapping_error(pool->p.dev, dma))
		return false;

	if (page_pool_set_dma_addr(page, dma))
		goto unmap_failed;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

	return true;

unmap_failed:
	WARN_ON_ONCE("unexpected DMA address, please report to netdev@");
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	return false;
}

static void page_pool_set_pp_info(struct page_pool *pool,
				  struct page *page)
{
	page->pp = pool;
	page->pp_magic |= PP_SIGNATURE;

	/* Ensuring all pages have been split into one fragment initially:
	 * page_pool_set_pp_info() is only called once for every page when it
	 * is allocated from the page allocator and page_pool_fragment_page()
	 * is dirtying the same cache line as the page->pp_magic above, so
	 * the overhead is negligible.
	 */
	page_pool_fragment_page(page, 1);
	if (pool->has_init_callback)
		pool->slow.init_callback(page, pool->slow.init_arg);
}

static void page_pool_clear_pp_info(struct page *page)
{
	page->pp_magic = 0;
	page->pp = NULL;
}

static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
						 gfp_t gfp)
{
	struct page *page;

	gfp |= __GFP_COMP;
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (unlikely(!page))
		return NULL;

	if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
	    unlikely(!page_pool_dma_map(pool, page))) {
		put_page(page);
		return NULL;
	}

	alloc_stat_inc(pool, slow_high_order);
	page_pool_set_pp_info(pool, page);

	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;
	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
	return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t gfp)
{
	const int bulk = PP_ALLOC_CACHE_REFILL;
	unsigned int pp_flags = pool->p.flags;
	unsigned int pp_order = pool->p.order;
	struct page *page;
	int i, nr_pages;

	/* Don't support bulk alloc for high-order pages */
	if (unlikely(pp_order))
		return __page_pool_alloc_page_order(pool, gfp);

	/* Unnecessary as alloc cache is empty, but guarantees zero count */
	if (unlikely(pool->alloc.count > 0))
		return pool->alloc.cache[--pool->alloc.count];

	/* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
	memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);

	nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk,
					       pool->alloc.cache);
	if (unlikely(!nr_pages))
		return NULL;

	/* Pages have been filled into the alloc.cache array, but count is
	 * zero and the page elements have not been (possibly) DMA mapped.
	 */
	for (i = 0; i < nr_pages; i++) {
		page = pool->alloc.cache[i];
		if ((pp_flags & PP_FLAG_DMA_MAP) &&
		    unlikely(!page_pool_dma_map(pool, page))) {
			put_page(page);
			continue;
		}

		page_pool_set_pp_info(pool, page);
		pool->alloc.cache[pool->alloc.count++] = page;
		/* Track how many pages are held 'in-flight' */
		pool->pages_state_hold_cnt++;
		trace_page_pool_state_hold(pool, page,
					   pool->pages_state_hold_cnt);
	}

	/* Return last page */
	if (likely(pool->alloc.count > 0)) {
		page = pool->alloc.cache[--pool->alloc.count];
		alloc_stat_inc(pool, slow);
	} else {
		page = NULL;
	}

	/* When a page is just alloc'ed, it should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, but provide a
 * synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
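
/* Illustrative sketch (not part of this file): an RX refill loop built on
 * page_pool_alloc_pages(). The rxq fields and my_rx_desc_set_addr() are
 * assumptions for the example; with PP_FLAG_DMA_MAP set, the pool already
 * holds the mapping and the driver only fetches the address to program its
 * descriptor.
 *
 *	while (rxq->fill_level < rxq->fill_target) {
 *		struct page *page;
 *
 *		page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *		if (unlikely(!page))
 *			break;
 *
 *		my_rx_desc_set_addr(rxq, page_pool_get_dma_addr(page) +
 *					 rxq->headroom);
 *		rxq->fill_level++;
 *	}
 */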

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b)	(s32)((a) - (b))
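
/* Worked example (wraparound case): if hold_cnt has wrapped around to 5
 * while release_cnt still reads 0xfffffffb (i.e. 2^32 - 5), then
 * _distance(5, 0xfffffffb) evaluates to 10, the true count of pages handed
 * out but not yet released.
 */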

s32 page_pool_inflight(const struct page_pool *pool, bool strict)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	if (strict) {
		trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
		WARN(inflight < 0, "Negative(%d) inflight packet-pages",
		     inflight);
	} else {
		inflight = max(0, inflight);
	}

	return inflight;
}

static __always_inline
void __page_pool_release_page_dma(struct page_pool *pool, struct page *page)
{
	dma_addr_t dma;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		/* Always account for inflight pages, even if we didn't
		 * map them
		 */
		return;

	dma = page_pool_get_dma_addr(page);

	/* When page is unmapped, it cannot be returned to our pool */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
	page_pool_set_dma_addr(page, 0);
}

/* Disconnects a page (from a page_pool). API users can have a need
 * to disconnect a page (from a page_pool), to allow it to be used as
 * a regular page (that will eventually be returned to the normal
 * page-allocator via put_page).
 */
void page_pool_return_page(struct page_pool *pool, struct page *page)
{
	int count;

	__page_pool_release_page_dma(pool, page);

	page_pool_clear_pp_info(page);

	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
	int ret;
	/* BH protection not needed if current is softirq */
	if (in_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	if (!ret) {
		recycle_stat_inc(pool, ring);
		return true;
	}

	return false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool page_pool_recycle_in_cache(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
		recycle_stat_inc(pool, cache_full);
		return false;
	}

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	recycle_stat_inc(pool, cached);
	return true;
}

static bool __page_pool_page_can_be_recycled(const struct page *page)
{
	return page_ref_count(page) == 1 && !page_is_pfmemalloc(page);
}

/* If the page refcnt == 1, this will try to recycle the page.
 * If PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
static __always_inline struct page *
__page_pool_put_page(struct page_pool *pool, struct page *page,
		     unsigned int dma_sync_size, bool allow_direct)
{
	lockdep_assert_no_hardirq();

	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 *
	 * page is NOT reusable when allocated when system is under
	 * some pressure. (page_is_pfmemalloc)
	 */
	if (likely(__page_pool_page_can_be_recycled(page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_softirq() &&
		    page_pool_recycle_in_cache(page, pool))
			return NULL;

		/* Page found as candidate for recycling */
		return page;
	}
	/* Fallback/non-XDP mode: API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In-case page_pool maintains the DMA mapping, API user must
	 * call page_pool_put_page once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	recycle_stat_inc(pool, released_refcnt);
	page_pool_return_page(pool, page);

	return NULL;
}

void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
				unsigned int dma_sync_size, bool allow_direct)
{
	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
	if (page && !page_pool_recycle_in_ring(pool, page)) {
		/* Cache full, fallback to free pages */
		recycle_stat_inc(pool, ring_full);
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);

/**
 * page_pool_put_page_bulk() - release references on multiple pages
 * @pool: pool from which pages were allocated
 * @data: array holding page pointers
 * @count: number of pages in @data
 *
 * Tries to refill a number of pages into the ptr_ring cache while holding
 * the ptr_ring producer lock. If the ptr_ring is full,
 * page_pool_put_page_bulk() will release leftover pages to the page
 * allocator. page_pool_put_page_bulk() is suitable to be run inside the
 * driver NAPI tx completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the data area after running
 * page_pool_put_page_bulk(), as this function overwrites it.
 */
void page_pool_put_page_bulk(struct page_pool *pool, void **data,
			     int count)
{
	int i, bulk_len = 0;
	bool in_softirq;

	for (i = 0; i < count; i++) {
		struct page *page = virt_to_head_page(data[i]);

		/* It is not the last user for the page frag case */
		if (!page_pool_is_last_ref(page))
			continue;

		page = __page_pool_put_page(pool, page, -1, false);
		/* Approved for bulk recycling in ptr_ring cache */
		if (page)
			data[bulk_len++] = page;
	}

	if (unlikely(!bulk_len))
		return;

	/* Bulk producer into ptr_ring page_pool cache */
	in_softirq = page_pool_producer_lock(pool);
	for (i = 0; i < bulk_len; i++) {
		if (__ptr_ring_produce(&pool->ring, data[i])) {
			/* ring full */
			recycle_stat_inc(pool, ring_full);
			break;
		}
	}
	recycle_stat_add(pool, ring, i);
	page_pool_producer_unlock(pool, in_softirq);

	/* Hopefully all pages were returned into the ptr_ring */
	if (likely(i == bulk_len))
		return;

	/* ptr_ring cache full, free remaining pages outside producer lock
	 * since put_page() with refcnt == 1 can be an expensive operation
	 */
	for (; i < bulk_len; i++)
		page_pool_return_page(pool, data[i]);
}
EXPORT_SYMBOL(page_pool_put_page_bulk);
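
/* Illustrative sketch (not part of this file): releasing a batch of
 * redirected frames from a driver's TX completion path. Drivers usually
 * reach page_pool_put_page_bulk() through the xdp frame-bulk helpers rather
 * than calling it directly; my_txq_next_completed_frame() is a hypothetical
 * name, and the driver's completion-ring bookkeeping and locking are elided.
 *
 *	struct xdp_frame_bulk bq;
 *	struct xdp_frame *frame;
 *
 *	xdp_frame_bulk_init(&bq);
 *	while ((frame = my_txq_next_completed_frame(txq)))
 *		xdp_return_frame_bulk(frame, &bq);
 *	xdp_flush_frame_bulk(&bq);
 */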

static struct page *page_pool_drain_frag(struct page_pool *pool,
					 struct page *page)
{
	long drain_count = BIAS_MAX - pool->frag_users;

	/* Some user is still using the page frag */
	if (likely(page_pool_unref_page(page, drain_count)))
		return NULL;

	if (__page_pool_page_can_be_recycled(page)) {
		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page, -1);

		return page;
	}

	page_pool_return_page(pool, page);
	return NULL;
}

static void page_pool_free_frag(struct page_pool *pool)
{
	long drain_count = BIAS_MAX - pool->frag_users;
	struct page *page = pool->frag_page;

	pool->frag_page = NULL;

	if (!page || page_pool_unref_page(page, drain_count))
		return;

	page_pool_return_page(pool, page);
}

struct page *page_pool_alloc_frag(struct page_pool *pool,
				  unsigned int *offset,
				  unsigned int size, gfp_t gfp)
{
	unsigned int max_size = PAGE_SIZE << pool->p.order;
	struct page *page = pool->frag_page;

	if (WARN_ON(size > max_size))
		return NULL;

	size = ALIGN(size, dma_get_cache_alignment());
	*offset = pool->frag_offset;

	if (page && *offset + size > max_size) {
		page = page_pool_drain_frag(pool, page);
		if (page) {
			alloc_stat_inc(pool, fast);
			goto frag_reset;
		}
	}

	if (!page) {
		page = page_pool_alloc_pages(pool, gfp);
		if (unlikely(!page)) {
			pool->frag_page = NULL;
			return NULL;
		}

		pool->frag_page = page;

frag_reset:
		pool->frag_users = 1;
		*offset = 0;
		pool->frag_offset = size;
		page_pool_fragment_page(page, BIAS_MAX);
		return page;
	}

	pool->frag_users++;
	pool->frag_offset = *offset + size;
	alloc_stat_inc(pool, fast);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_frag);
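
/* Illustrative sketch (not part of this file): carving sub-page RX buffers
 * out of pool pages with the frag API. The buf_len value and the error
 * handling are assumptions for the example.
 *
 *	unsigned int offset;
 *	struct page *page;
 *
 *	page = page_pool_alloc_frag(pool, &offset, buf_len, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *	// first buffer lives at 'page' + 'offset', buf_len bytes long
 *
 *	page = page_pool_alloc_frag(pool, &offset, buf_len, GFP_ATOMIC);
 *	// may return the same page with a new 'offset', or start a new page;
 *	// each frag is later released via page_pool_put_page()/skb recycling.
 */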

static void page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		page_pool_return_page(pool, page);
	}
}

static void __page_pool_destroy(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	page_pool_unlist(pool);
	page_pool_uninit(pool);
	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assume caller made sure this is
	 * no-longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool, true);
	if (!inflight)
		__page_pool_destroy(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	void *netdev;
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning for page pools the user can't see */
	netdev = READ_ONCE(pool->slow.netdev);
	if (time_after_eq(jiffies, pool->defer_warn) &&
	    (!netdev || netdev == NET_PTR_POISON)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
			__func__, pool->user.id, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
			   struct xdp_mem_info *mem)
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
	pool->xdp_mem_id = mem->id;
}

static void page_pool_disable_direct_recycling(struct page_pool *pool)
{
	/* Disable direct recycling based on pool->cpuid.
	 * Paired with READ_ONCE() in napi_pp_put_page().
	 */
	WRITE_ONCE(pool->cpuid, -1);

	if (!pool->p.napi)
		return;

	/* To avoid races with recycling and additional barriers make sure
	 * pool and NAPI are unlinked when NAPI is disabled.
	 */
	WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) ||
		READ_ONCE(pool->p.napi->list_owner) != -1);

	WRITE_ONCE(pool->p.napi, NULL);
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	page_pool_disable_direct_recycling(pool);
	page_pool_free_frag(pool);

	if (!page_pool_release(pool))
		return;

	page_pool_detached(pool);
	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	struct page *page;

	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;

	/* Flush pool alloc cache, as refill will check NUMA node */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		page_pool_return_page(pool, page);
	}
}
EXPORT_SYMBOL(page_pool_update_nid);