Commit | Line | Data |
---|---|---|
ff7d6b27 JDB |
1 | /* SPDX-License-Identifier: GPL-2.0 |
2 | * | |
3 | * page_pool.c | |
4 | * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> | |
5 | * Copyright (C) 2016 Red Hat, Inc. | |
6 | */ | |
32c28f7e | 7 | |
12b6c3a0 | 8 | #include <linux/error-injection.h> |
ff7d6b27 JDB |
9 | #include <linux/types.h> |
10 | #include <linux/kernel.h> | |
11 | #include <linux/slab.h> | |
f71fec47 | 12 | #include <linux/device.h> |
ff7d6b27 | 13 | |
b5245865 | 14 | #include <net/netdev_lock.h> |
0f921404 | 15 | #include <net/netdev_rx_queue.h> |
a9ca9f9c | 16 | #include <net/page_pool/helpers.h> |
57afb483 | 17 | #include <net/page_pool/memory_provider.h> |
78862447 LB |
18 | #include <net/xdp.h> |
19 | ||
ff7d6b27 JDB |
20 | #include <linux/dma-direction.h> |
21 | #include <linux/dma-mapping.h> | |
22 | #include <linux/page-flags.h> | |
8d29c703 | 23 | #include <linux/mm.h> /* for put_page() */ |
c07aea3e | 24 | #include <linux/poison.h> |
f3c5264f | 25 | #include <linux/ethtool.h> |
8c48eea3 | 26 | #include <linux/netdevice.h> |
ff7d6b27 | 27 | |
32c28f7e JDB |
28 | #include <trace/events/page_pool.h> |
29 | ||
c1e00bc4 | 30 | #include "dev.h" |
0f921404 | 31 | #include "mp_dmabuf_devmem.h" |
8ab79ed5 | 32 | #include "netmem_priv.h" |
f17c6964 JK |
33 | #include "page_pool_priv.h" |
34 | ||
8ab79ed5 MA |
35 | DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); |
36 | ||
c3f812ce JL |
37 | #define DEFER_TIME (msecs_to_jiffies(1000)) |
38 | #define DEFER_WARN_INTERVAL (60 * HZ) | |
39 | ||
aaf153ae | 40 | #define BIAS_MAX (LONG_MAX >> 1) |
53e0961d | 41 | |
8610037e | 42 | #ifdef CONFIG_PAGE_POOL_STATS |
f853fa5c LB |
43 | static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); |
44 | ||
8610037e JD |
45 | /* alloc_stat_inc is intended to be used in softirq context */ |
46 | #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) | |
ad6fa1e1 JD |
47 | /* recycle_stat_inc is safe to use when preemption is possible. */ |
48 | #define recycle_stat_inc(pool, __stat) \ | |
49 | do { \ | |
50 | struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ | |
51 | this_cpu_inc(s->__stat); \ | |
52 | } while (0) | |
6b95e338 | 53 | |
590032a4 LB |
54 | #define recycle_stat_add(pool, __stat, val) \ |
55 | do { \ | |
56 | struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ | |
57 | this_cpu_add(s->__stat, val); \ | |
58 | } while (0) | |
59 | ||
f3c5264f LB |
60 | static const char pp_stats[][ETH_GSTRING_LEN] = { |
61 | "rx_pp_alloc_fast", | |
62 | "rx_pp_alloc_slow", | |
63 | "rx_pp_alloc_slow_ho", | |
64 | "rx_pp_alloc_empty", | |
65 | "rx_pp_alloc_refill", | |
66 | "rx_pp_alloc_waive", | |
67 | "rx_pp_recycle_cached", | |
68 | "rx_pp_recycle_cache_full", | |
69 | "rx_pp_recycle_ring", | |
70 | "rx_pp_recycle_ring_full", | |
71 | "rx_pp_recycle_released_ref", | |
72 | }; | |
73 | ||
82e896d9 JK |
74 | /** |
75 | * page_pool_get_stats() - fetch page pool stats | |
76 | * @pool: pool from which page was allocated | |
77 | * @stats: struct page_pool_stats to fill in | |
78 | * | |
79 | * Retrieve statistics about the page_pool. This API is only available | |
80 | * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. | |
81 | * A pointer to a caller-allocated struct page_pool_stats structure | |
82 | * is passed to this API, which fills it in. The caller can then report | |
83 | * those stats to the user (perhaps via ethtool, debugfs, etc.). | |
84 | */ | |
d49010ad | 85 | bool page_pool_get_stats(const struct page_pool *pool, |
6b95e338 JD |
86 | struct page_pool_stats *stats) |
87 | { | |
88 | int cpu = 0; | |
89 | ||
90 | if (!stats) | |
91 | return false; | |
92 | ||
f3c5264f LB |
93 | /* The caller is responsible to initialize stats. */ |
94 | stats->alloc_stats.fast += pool->alloc_stats.fast; | |
95 | stats->alloc_stats.slow += pool->alloc_stats.slow; | |
96 | stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; | |
97 | stats->alloc_stats.empty += pool->alloc_stats.empty; | |
98 | stats->alloc_stats.refill += pool->alloc_stats.refill; | |
99 | stats->alloc_stats.waive += pool->alloc_stats.waive; | |
6b95e338 JD |
100 | |
101 | for_each_possible_cpu(cpu) { | |
102 | const struct page_pool_recycle_stats *pcpu = | |
103 | per_cpu_ptr(pool->recycle_stats, cpu); | |
104 | ||
105 | stats->recycle_stats.cached += pcpu->cached; | |
106 | stats->recycle_stats.cache_full += pcpu->cache_full; | |
107 | stats->recycle_stats.ring += pcpu->ring; | |
108 | stats->recycle_stats.ring_full += pcpu->ring_full; | |
109 | stats->recycle_stats.released_refcnt += pcpu->released_refcnt; | |
110 | } | |
111 | ||
112 | return true; | |
113 | } | |
114 | EXPORT_SYMBOL(page_pool_get_stats); | |
f3c5264f LB |
115 | |
116 | u8 *page_pool_ethtool_stats_get_strings(u8 *data) | |
117 | { | |
118 | int i; | |
119 | ||
120 | for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { | |
121 | memcpy(data, pp_stats[i], ETH_GSTRING_LEN); | |
122 | data += ETH_GSTRING_LEN; | |
123 | } | |
124 | ||
125 | return data; | |
126 | } | |
127 | EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); | |
128 | ||
129 | int page_pool_ethtool_stats_get_count(void) | |
130 | { | |
131 | return ARRAY_SIZE(pp_stats); | |
132 | } | |
133 | EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); | |
134 | ||
ef9226cd | 135 | u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) |
f3c5264f | 136 | { |
ef9226cd | 137 | const struct page_pool_stats *pool_stats = stats; |
f3c5264f LB |
138 | |
139 | *data++ = pool_stats->alloc_stats.fast; | |
140 | *data++ = pool_stats->alloc_stats.slow; | |
141 | *data++ = pool_stats->alloc_stats.slow_high_order; | |
142 | *data++ = pool_stats->alloc_stats.empty; | |
143 | *data++ = pool_stats->alloc_stats.refill; | |
144 | *data++ = pool_stats->alloc_stats.waive; | |
145 | *data++ = pool_stats->recycle_stats.cached; | |
146 | *data++ = pool_stats->recycle_stats.cache_full; | |
147 | *data++ = pool_stats->recycle_stats.ring; | |
148 | *data++ = pool_stats->recycle_stats.ring_full; | |
149 | *data++ = pool_stats->recycle_stats.released_refcnt; | |
150 | ||
151 | return data; | |
152 | } | |
153 | EXPORT_SYMBOL(page_pool_ethtool_stats_get); | |
154 | ||
8610037e | 155 | #else |
271683bb DC |
156 | #define alloc_stat_inc(...) do { } while (0) |
157 | #define recycle_stat_inc(...) do { } while (0) | |
158 | #define recycle_stat_add(...) do { } while (0) | |
8610037e JD |
159 | #endif |
160 | ||
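The three ethtool helpers above (count/strings/get) are meant to be chained from a driver's ethtool callbacks, with page_pool_get_stats() aggregating the per-pool counters first. A minimal sketch of that wiring, assuming a hypothetical driver with one pool per RX ring (the mydrv_* names and the priv layout are illustrative, not part of this file):

```c
/* Illustrative glue only; mydrv_priv and its rx_ring array are assumed. */
static void mydrv_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	if (sset != ETH_SS_STATS)
		return;

	/* Driver-private strings would be copied first; the helper then
	 * appends the "rx_pp_*" names and returns the advanced pointer.
	 */
	data = page_pool_ethtool_stats_get_strings(data);
}

static int mydrv_get_sset_count(struct net_device *dev, int sset)
{
	return sset == ETH_SS_STATS ? page_pool_ethtool_stats_get_count() : 0;
}

static void mydrv_get_ethtool_stats(struct net_device *dev,
				    struct ethtool_stats *stats, u64 *data)
{
	struct mydrv_priv *priv = netdev_priv(dev);
	struct page_pool_stats pp = {};	/* caller must zero-initialize */
	int i;

	/* Accumulate stats from every per-ring pool into one struct */
	for (i = 0; i < priv->num_rx_rings; i++)
		page_pool_get_stats(priv->rx_ring[i].page_pool, &pp);

	data = page_pool_ethtool_stats_get(data, &pp);
}
```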
368d3cb4 YL |
161 | static bool page_pool_producer_lock(struct page_pool *pool) |
162 | __acquires(&pool->ring.producer_lock) | |
163 | { | |
164 | bool in_softirq = in_softirq(); | |
165 | ||
166 | if (in_softirq) | |
167 | spin_lock(&pool->ring.producer_lock); | |
168 | else | |
169 | spin_lock_bh(&pool->ring.producer_lock); | |
170 | ||
171 | return in_softirq; | |
172 | } | |
173 | ||
174 | static void page_pool_producer_unlock(struct page_pool *pool, | |
175 | bool in_softirq) | |
176 | __releases(&pool->ring.producer_lock) | |
177 | { | |
178 | if (in_softirq) | |
179 | spin_unlock(&pool->ring.producer_lock); | |
180 | else | |
181 | spin_unlock_bh(&pool->ring.producer_lock); | |
182 | } | |
183 | ||
1f20a576 AL |
184 | static void page_pool_struct_check(void) |
185 | { | |
186 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); | |
187 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); | |
188 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); | |
39daa09d AL |
189 | CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, |
190 | PAGE_POOL_FRAG_GROUP_ALIGN); | |
1f20a576 AL |
191 | } |
192 | ||
ff7d6b27 | 193 | static int page_pool_init(struct page_pool *pool, |
2b0cfa6e LB |
194 | const struct page_pool_params *params, |
195 | int cpuid) | |
ff7d6b27 JDB |
196 | { |
197 | unsigned int ring_qsize = 1024; /* Default */ | |
0f921404 MA |
198 | struct netdev_rx_queue *rxq; |
199 | int err; | |
ff7d6b27 | 200 | |
1f20a576 AL |
201 | page_pool_struct_check(); |
202 | ||
5027ec19 JK |
203 | memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); |
204 | memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); | |
ff7d6b27 | 205 | |
2b0cfa6e | 206 | pool->cpuid = cpuid; |
7dba339f | 207 | pool->dma_sync_for_cpu = true; |
2b0cfa6e | 208 | |
ff7d6b27 | 209 | /* Validate only known flags were used */ |
403f11ac | 210 | if (pool->slow.flags & ~PP_FLAG_ALL) |
ff7d6b27 JDB |
211 | return -EINVAL; |
212 | ||
213 | if (pool->p.pool_size) | |
214 | ring_qsize = pool->p.pool_size; | |
215 | ||
216 | /* Sanity limit mem that can be pinned down */ | |
217 | if (ring_qsize > 32768) | |
218 | return -E2BIG; | |
219 | ||
220 | /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. | |
221 | * DMA_BIDIRECTIONAL is for allowing the page to be used for DMA sending, | |
222 | * which is the XDP_TX use-case. | |
223 | */ | |
403f11ac | 224 | if (pool->slow.flags & PP_FLAG_DMA_MAP) { |
798dda81 DK |
225 | if ((pool->p.dma_dir != DMA_FROM_DEVICE) && |
226 | (pool->p.dma_dir != DMA_BIDIRECTIONAL)) | |
227 | return -EINVAL; | |
403f11ac AL |
228 | |
229 | pool->dma_map = true; | |
798dda81 | 230 | } |
ff7d6b27 | 231 | |
403f11ac | 232 | if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { |
e68bc756 LB |
233 | /* In order to request DMA-sync-for-device the page |
234 | * needs to be mapped | |
235 | */ | |
403f11ac | 236 | if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) |
e68bc756 LB |
237 | return -EINVAL; |
238 | ||
239 | if (!pool->p.max_len) | |
240 | return -EINVAL; | |
241 | ||
403f11ac AL |
242 | pool->dma_sync = true; |
243 | ||
e68bc756 LB |
244 | /* pool->p.offset has to be set according to the address |
245 | * offset used by the DMA engine to start copying rx data | |
246 | */ | |
247 | } | |
248 | ||
2da0cac1 JK |
249 | pool->has_init_callback = !!pool->slow.init_callback; |
250 | ||
ad6fa1e1 | 251 | #ifdef CONFIG_PAGE_POOL_STATS |
403f11ac | 252 | if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { |
f853fa5c LB |
253 | pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); |
254 | if (!pool->recycle_stats) | |
255 | return -ENOMEM; | |
256 | } else { | |
257 | /* For system page pool instance we use a singular stats object | |
258 | * instead of allocating a separate percpu variable for each | |
259 | * (also percpu) page pool instance. | |
260 | */ | |
261 | pool->recycle_stats = &pp_system_recycle_stats; | |
403f11ac | 262 | pool->system = true; |
f853fa5c | 263 | } |
ad6fa1e1 JD |
264 | #endif |
265 | ||
8ffbd166 JS |
266 | if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { |
267 | #ifdef CONFIG_PAGE_POOL_STATS | |
403f11ac | 268 | if (!pool->system) |
f853fa5c | 269 | free_percpu(pool->recycle_stats); |
8ffbd166 | 270 | #endif |
ff7d6b27 | 271 | return -ENOMEM; |
8ffbd166 | 272 | } |
ff7d6b27 | 273 | |
99c07c43 JDB |
274 | atomic_set(&pool->pages_state_release_cnt, 0); |
275 | ||
1da4bbef IK |
276 | /* Driver calling page_pool_create() must also call page_pool_destroy() */ | |
277 | refcount_set(&pool->user_cnt, 1); | |
278 | ||
ee62ce7a | 279 | xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); |
f71fec47 | 280 | |
0f921404 | 281 | if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { |
b5245865 | 282 | netdev_assert_locked(pool->slow.netdev); |
0f921404 MA |
283 | rxq = __netif_get_rx_queue(pool->slow.netdev, |
284 | pool->slow.queue_idx); | |
285 | pool->mp_priv = rxq->mp_params.mp_priv; | |
57afb483 | 286 | pool->mp_ops = rxq->mp_params.mp_ops; |
0f921404 MA |
287 | } |
288 | ||
57afb483 | 289 | if (pool->mp_ops) { |
b400f4b8 SK |
290 | if (!pool->dma_map || !pool->dma_sync) |
291 | return -EOPNOTSUPP; | |
292 | ||
57afb483 PB |
293 | if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { |
294 | err = -EFAULT; | |
295 | goto free_ptr_ring; | |
296 | } | |
297 | ||
298 | err = pool->mp_ops->init(pool); | |
0f921404 MA |
299 | if (err) { |
300 | pr_warn("%s() mem-provider init failed %d\n", __func__, | |
301 | err); | |
302 | goto free_ptr_ring; | |
303 | } | |
304 | ||
305 | static_branch_inc(&page_pool_mem_providers); | |
306 | } | |
307 | ||
ff7d6b27 | 308 | return 0; |
0f921404 MA |
309 | |
310 | free_ptr_ring: | |
311 | ptr_ring_cleanup(&pool->ring, NULL); | |
312 | #ifdef CONFIG_PAGE_POOL_STATS | |
313 | if (!pool->system) | |
314 | free_percpu(pool->recycle_stats); | |
315 | #endif | |
316 | return err; | |
ff7d6b27 JDB |
317 | } |
318 | ||
23cfaf67 JK |
319 | static void page_pool_uninit(struct page_pool *pool) |
320 | { | |
321 | ptr_ring_cleanup(&pool->ring, NULL); | |
ee62ce7a | 322 | xa_destroy(&pool->dma_mapped); |
23cfaf67 JK |
323 | |
324 | #ifdef CONFIG_PAGE_POOL_STATS | |
403f11ac | 325 | if (!pool->system) |
f853fa5c | 326 | free_percpu(pool->recycle_stats); |
23cfaf67 JK |
327 | #endif |
328 | } | |
329 | ||
82e896d9 | 330 | /** |
2b0cfa6e | 331 | * page_pool_create_percpu() - create a page pool for a given cpu. |
82e896d9 | 332 | * @params: parameters, see struct page_pool_params |
2b0cfa6e | 333 | * @cpuid: cpu identifier |
82e896d9 | 334 | */ |
2b0cfa6e LB |
335 | struct page_pool * |
336 | page_pool_create_percpu(const struct page_pool_params *params, int cpuid) | |
ff7d6b27 JDB |
337 | { |
338 | struct page_pool *pool; | |
873343e7 | 339 | int err; |
ff7d6b27 JDB |
340 | |
341 | pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); | |
342 | if (!pool) | |
343 | return ERR_PTR(-ENOMEM); | |
344 | ||
2b0cfa6e | 345 | err = page_pool_init(pool, params, cpuid); |
f17c6964 JK |
346 | if (err < 0) |
347 | goto err_free; | |
348 | ||
349 | err = page_pool_list(pool); | |
350 | if (err) | |
351 | goto err_uninit; | |
1da4bbef | 352 | |
ff7d6b27 | 353 | return pool; |
f17c6964 JK |
354 | |
355 | err_uninit: | |
356 | page_pool_uninit(pool); | |
357 | err_free: | |
358 | pr_warn("%s() gave up with errno %d\n", __func__, err); | |
359 | kfree(pool); | |
360 | return ERR_PTR(err); | |
ff7d6b27 | 361 | } |
2b0cfa6e LB |
362 | EXPORT_SYMBOL(page_pool_create_percpu); |
363 | ||
364 | /** | |
365 | * page_pool_create() - create a page pool | |
366 | * @params: parameters, see struct page_pool_params | |
367 | */ | |
368 | struct page_pool *page_pool_create(const struct page_pool_params *params) | |
369 | { | |
370 | return page_pool_create_percpu(params, -1); | |
371 | } | |
ff7d6b27 JDB |
372 | EXPORT_SYMBOL(page_pool_create); |
373 | ||
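For context, a driver typically creates one pool per RX queue, filling struct page_pool_params before calling page_pool_create(). The sketch below is illustrative only; the mydrv_rx_ring type and the chosen ring size and headroom are assumptions, not requirements of this file:

```c
/* Illustrative pool creation; sizes and headroom are assumptions. */
static struct page_pool *mydrv_create_rx_pool(struct mydrv_rx_ring *ring,
					      struct device *dma_dev)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,			/* single pages */
		.pool_size	= 1024,			/* must be <= 32768 */
		.nid		= NUMA_NO_NODE,
		.dev		= dma_dev,		/* device doing the DMA */
		.napi		= &ring->napi,		/* enables direct recycling */
		.dma_dir	= DMA_FROM_DEVICE,
		.offset		= XDP_PACKET_HEADROOM,
		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
	};

	return page_pool_create(&pp_params);	/* ERR_PTR() on failure */
}
```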
4dec64c5 | 374 | static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); |
44768dec | 375 | |
4dec64c5 | 376 | static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) |
44768dec JDB |
377 | { |
378 | struct ptr_ring *r = &pool->ring; | |
4dec64c5 | 379 | netmem_ref netmem; |
44768dec JDB |
380 | int pref_nid; /* preferred NUMA node */ |
381 | ||
382 | /* Quicker fallback, avoid locks when ring is empty */ | |
8610037e JD |
383 | if (__ptr_ring_empty(r)) { |
384 | alloc_stat_inc(pool, empty); | |
4dec64c5 | 385 | return 0; |
8610037e | 386 | } |
44768dec JDB |
387 | |
388 | /* Softirq guarantees the CPU and thus the NUMA node are stable. This | |
389 | * assumes the CPU refilling the driver RX-ring will also run RX-NAPI. | |
390 | */ | |
f13fc107 | 391 | #ifdef CONFIG_NUMA |
44768dec | 392 | pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; |
f13fc107 JDB |
393 | #else |
394 | /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ | |
395 | pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ | |
396 | #endif | |
44768dec | 397 | |
44768dec JDB |
398 | /* Refill alloc array, but only if NUMA match */ |
399 | do { | |
4dec64c5 MA |
400 | netmem = (__force netmem_ref)__ptr_ring_consume(r); |
401 | if (unlikely(!netmem)) | |
44768dec JDB |
402 | break; |
403 | ||
8ab79ed5 | 404 | if (likely(netmem_is_pref_nid(netmem, pref_nid))) { |
4dec64c5 | 405 | pool->alloc.cache[pool->alloc.count++] = netmem; |
44768dec JDB |
406 | } else { |
407 | /* NUMA mismatch; | |
408 | * (1) release 1 page to page-allocator and | |
409 | * (2) break out to fallthrough to alloc_pages_node. | |
410 | * This limits stress on the page buddy allocator. | |
411 | */ | |
4dec64c5 | 412 | page_pool_return_page(pool, netmem); |
8610037e | 413 | alloc_stat_inc(pool, waive); |
4dec64c5 | 414 | netmem = 0; |
44768dec JDB |
415 | break; |
416 | } | |
304db6cb | 417 | } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); |
44768dec JDB |
418 | |
419 | /* Return last page */ | |
8610037e | 420 | if (likely(pool->alloc.count > 0)) { |
4dec64c5 | 421 | netmem = pool->alloc.cache[--pool->alloc.count]; |
8610037e JD |
422 | alloc_stat_inc(pool, refill); |
423 | } | |
44768dec | 424 | |
4dec64c5 | 425 | return netmem; |
44768dec JDB |
426 | } |
427 | ||
ff7d6b27 | 428 | /* fast path */ |
4dec64c5 | 429 | static netmem_ref __page_pool_get_cached(struct page_pool *pool) |
ff7d6b27 | 430 | { |
4dec64c5 | 431 | netmem_ref netmem; |
ff7d6b27 | 432 | |
304db6cb LR |
433 | /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ |
434 | if (likely(pool->alloc.count)) { | |
435 | /* Fast-path */ | |
4dec64c5 | 436 | netmem = pool->alloc.cache[--pool->alloc.count]; |
8610037e | 437 | alloc_stat_inc(pool, fast); |
304db6cb | 438 | } else { |
4dec64c5 | 439 | netmem = page_pool_refill_alloc_cache(pool); |
ff7d6b27 JDB |
440 | } |
441 | ||
4dec64c5 | 442 | return netmem; |
ff7d6b27 JDB |
443 | } |
444 | ||
4321de44 | 445 | static void __page_pool_dma_sync_for_device(const struct page_pool *pool, |
4dec64c5 | 446 | netmem_ref netmem, |
4321de44 | 447 | u32 dma_sync_size) |
e68bc756 | 448 | { |
4321de44 | 449 | #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) |
4dec64c5 | 450 | dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); |
9ddb3c14 | 451 | |
e68bc756 | 452 | dma_sync_size = min(dma_sync_size, pool->p.max_len); |
4321de44 AL |
453 | __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, |
454 | dma_sync_size, pool->p.dma_dir); | |
455 | #endif | |
456 | } | |
457 | ||
458 | static __always_inline void | |
459 | page_pool_dma_sync_for_device(const struct page_pool *pool, | |
4dec64c5 | 460 | netmem_ref netmem, |
4321de44 AL |
461 | u32 dma_sync_size) |
462 | { | |
ee62ce7a THJ |
463 | if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { |
464 | rcu_read_lock(); | |
465 | /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ | |
466 | if (pool->dma_sync) | |
467 | __page_pool_dma_sync_for_device(pool, netmem, | |
468 | dma_sync_size); | |
469 | rcu_read_unlock(); | |
470 | } | |
e68bc756 LB |
471 | } |
472 | ||
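These helpers cover the device-side sync the pool performs on mapped pages; the CPU-side counterpart for drivers lives in &lt;net/page_pool/helpers.h&gt;. A hedged sketch, assuming the page_pool_dma_sync_for_cpu() helper (pool, page, offset, length) and a hypothetical ring/descriptor layout, syncing only the bytes the NIC actually wrote before touching the payload:

```c
/* Illustrative RX completion snippet; ring layout and mydrv_build_skb()
 * are assumed, not part of this file.
 */
static void mydrv_rx_one(struct mydrv_rx_ring *ring, struct page *page,
			 unsigned int pkt_len)
{
	/* Sync only the received bytes for CPU access; the pool already
	 * synced for the device when the page was handed out/recycled.
	 */
	page_pool_dma_sync_for_cpu(ring->page_pool, page, 0, pkt_len);

	mydrv_build_skb(ring, page, pkt_len);	/* hypothetical consumer */
}
```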
ee62ce7a | 473 | static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) |
dfa59717 JDB |
474 | { |
475 | dma_addr_t dma; | |
ee62ce7a THJ |
476 | int err; |
477 | u32 id; | |
dfa59717 JDB |
478 | |
479 | /* Setup DMA mapping: use 'struct page' area for storing DMA-addr | |
480 | * since dma_addr_t can be either 32 or 64 bits and does not always fit | |
481 | * into page private data (i.e 32bit cpu with 64bit DMA caps) | |
482 | * This mapping is kept for lifetime of page, until leaving pool. | |
483 | */ | |
4dec64c5 MA |
484 | dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, |
485 | (PAGE_SIZE << pool->p.order), pool->p.dma_dir, | |
486 | DMA_ATTR_SKIP_CPU_SYNC | | |
487 | DMA_ATTR_WEAK_ORDERING); | |
dfa59717 JDB |
488 | if (dma_mapping_error(pool->p.dev, dma)) |
489 | return false; | |
490 | ||
ee62ce7a THJ |
491 | if (page_pool_set_dma_addr_netmem(netmem, dma)) { |
492 | WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); | |
90de47f0 | 493 | goto unmap_failed; |
ee62ce7a THJ |
494 | } |
495 | ||
496 | if (in_softirq()) | |
497 | err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), | |
498 | PP_DMA_INDEX_LIMIT, gfp); | |
499 | else | |
500 | err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), | |
501 | PP_DMA_INDEX_LIMIT, gfp); | |
502 | if (err) { | |
503 | WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); | |
504 | goto unset_failed; | |
505 | } | |
dfa59717 | 506 | |
ee62ce7a | 507 | netmem_set_dma_index(netmem, id); |
4dec64c5 | 508 | page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); |
dfa59717 JDB |
509 | |
510 | return true; | |
90de47f0 | 511 | |
ee62ce7a THJ |
512 | unset_failed: |
513 | page_pool_set_dma_addr_netmem(netmem, 0); | |
90de47f0 | 514 | unmap_failed: |
90de47f0 YL |
515 | dma_unmap_page_attrs(pool->p.dev, dma, |
516 | PAGE_SIZE << pool->p.order, pool->p.dma_dir, | |
517 | DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); | |
518 | return false; | |
dfa59717 JDB |
519 | } |
520 | ||
be5dba25 JDB |
521 | static struct page *__page_pool_alloc_page_order(struct page_pool *pool, |
522 | gfp_t gfp) | |
ff7d6b27 JDB |
523 | { |
524 | struct page *page; | |
ff7d6b27 | 525 | |
be5dba25 | 526 | gfp |= __GFP_COMP; |
ff7d6b27 | 527 | page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); |
be5dba25 | 528 | if (unlikely(!page)) |
ff7d6b27 JDB |
529 | return NULL; |
530 | ||
ee62ce7a | 531 | if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { |
ff7d6b27 JDB |
532 | put_page(page); |
533 | return NULL; | |
534 | } | |
ff7d6b27 | 535 | |
8610037e | 536 | alloc_stat_inc(pool, slow_high_order); |
4dec64c5 | 537 | page_pool_set_pp_info(pool, page_to_netmem(page)); |
c07aea3e | 538 | |
99c07c43 JDB |
539 | /* Track how many pages are held 'in-flight' */ |
540 | pool->pages_state_hold_cnt++; | |
4dec64c5 MA |
541 | trace_page_pool_state_hold(pool, page_to_netmem(page), |
542 | pool->pages_state_hold_cnt); | |
be5dba25 JDB |
543 | return page; |
544 | } | |
545 | ||
546 | /* slow path */ | |
4dec64c5 MA |
547 | static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, |
548 | gfp_t gfp) | |
be5dba25 JDB |
549 | { |
550 | const int bulk = PP_ALLOC_CACHE_REFILL; | |
be5dba25 | 551 | unsigned int pp_order = pool->p.order; |
403f11ac | 552 | bool dma_map = pool->dma_map; |
4dec64c5 | 553 | netmem_ref netmem; |
be5dba25 JDB |
554 | int i, nr_pages; |
555 | ||
556 | /* Don't support bulk alloc for high-order pages */ | |
557 | if (unlikely(pp_order)) | |
4dec64c5 | 558 | return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); |
be5dba25 JDB |
559 | |
560 | /* Unnecessary as alloc cache is empty, but guarantees zero count */ | |
561 | if (unlikely(pool->alloc.count > 0)) | |
562 | return pool->alloc.cache[--pool->alloc.count]; | |
563 | ||
6bf9b5b4 | 564 | /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ |
be5dba25 JDB |
565 | memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); |
566 | ||
6bf9b5b4 LC |
567 | nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, |
568 | (struct page **)pool->alloc.cache); | |
be5dba25 | 569 | if (unlikely(!nr_pages)) |
4dec64c5 | 570 | return 0; |
be5dba25 JDB |
571 | |
572 | /* Pages have been filled into the alloc.cache array, but count is zero and | |
573 | * the page elements have not yet been DMA mapped (when mapping is enabled). | |
574 | */ | |
575 | for (i = 0; i < nr_pages; i++) { | |
4dec64c5 | 576 | netmem = pool->alloc.cache[i]; |
ee62ce7a | 577 | if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { |
4dec64c5 | 578 | put_page(netmem_to_page(netmem)); |
be5dba25 JDB |
579 | continue; |
580 | } | |
57f05bc2 | 581 | |
4dec64c5 MA |
582 | page_pool_set_pp_info(pool, netmem); |
583 | pool->alloc.cache[pool->alloc.count++] = netmem; | |
be5dba25 JDB |
584 | /* Track how many pages are held 'in-flight' */ |
585 | pool->pages_state_hold_cnt++; | |
4dec64c5 | 586 | trace_page_pool_state_hold(pool, netmem, |
be5dba25 JDB |
587 | pool->pages_state_hold_cnt); |
588 | } | |
589 | ||
590 | /* Return last page */ | |
8610037e | 591 | if (likely(pool->alloc.count > 0)) { |
4dec64c5 | 592 | netmem = pool->alloc.cache[--pool->alloc.count]; |
8610037e JD |
593 | alloc_stat_inc(pool, slow); |
594 | } else { | |
4dec64c5 | 595 | netmem = 0; |
8610037e | 596 | } |
32c28f7e | 597 | |
ff7d6b27 | 598 | /* A page just allocated should/must have refcnt 1. */
4dec64c5 | 599 | return netmem; |
ff7d6b27 JDB |
600 | } |
601 | ||
602 | /* For using page_pool replace: alloc_pages() API calls, but provide | |
603 | * synchronization guarantee for allocation side. | |
604 | */ | |
91a152cb | 605 | netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) |
ff7d6b27 | 606 | { |
4dec64c5 | 607 | netmem_ref netmem; |
ff7d6b27 JDB |
608 | |
609 | /* Fast-path: Get a page from cache */ | |
4dec64c5 MA |
610 | netmem = __page_pool_get_cached(pool); |
611 | if (netmem) | |
612 | return netmem; | |
ff7d6b27 JDB |
613 | |
614 | /* Slow-path: cache empty, do real allocation */ | |
57afb483 PB |
615 | if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) |
616 | netmem = pool->mp_ops->alloc_netmems(pool, gfp); | |
0f921404 MA |
617 | else |
618 | netmem = __page_pool_alloc_pages_slow(pool, gfp); | |
4dec64c5 MA |
619 | return netmem; |
620 | } | |
91a152cb | 621 | EXPORT_SYMBOL(page_pool_alloc_netmems); |
8d20dcda | 622 | ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); |
4dec64c5 MA |
623 | |
624 | struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) | |
625 | { | |
91a152cb | 626 | return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); |
ff7d6b27 JDB |
627 | } |
628 | EXPORT_SYMBOL(page_pool_alloc_pages); | |
629 | ||
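On the fast path a driver refills its RX descriptors straight from the pool; with PP_FLAG_DMA_MAP set at create time, the DMA address is already stored with the page. A minimal refill sketch (the ring layout and mydrv_post_rx_desc() are assumptions):

```c
/* Illustrative RX refill loop; descriptor posting is driver specific. */
static int mydrv_refill_rx(struct mydrv_rx_ring *ring)
{
	while (ring->free_descs) {
		/* GFP_ATOMIC wrapper around page_pool_alloc_pages() */
		struct page *page = page_pool_dev_alloc_pages(ring->page_pool);

		if (unlikely(!page))
			return -ENOMEM;

		mydrv_post_rx_desc(ring, page,
				   page_pool_get_dma_addr(page) + ring->rx_offset);
		ring->free_descs--;
	}

	return 0;
}
```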
99c07c43 JDB |
630 | /* Calculate distance between two u32 values, valid if distance is below 2^(31) |
631 | * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution | |
632 | */ | |
633 | #define _distance(a, b) (s32)((a) - (b)) | |
634 | ||
7aee8429 | 635 | s32 page_pool_inflight(const struct page_pool *pool, bool strict) |
99c07c43 JDB |
636 | { |
637 | u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); | |
638 | u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); | |
c3f812ce | 639 | s32 inflight; |
99c07c43 | 640 | |
c3f812ce | 641 | inflight = _distance(hold_cnt, release_cnt); |
99c07c43 | 642 | |
7aee8429 JK |
643 | if (strict) { |
644 | trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); | |
645 | WARN(inflight < 0, "Negative(%d) inflight packet-pages", | |
646 | inflight); | |
647 | } else { | |
648 | inflight = max(0, inflight); | |
649 | } | |
99c07c43 | 650 | |
c3f812ce | 651 | return inflight; |
99c07c43 JDB |
652 | } |
653 | ||
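The _distance() trick above is plain serial-number arithmetic: subtracting the u32 counters and casting to s32 yields the signed distance even across a counter wrap. A worked example of the values page_pool_inflight() would compute:

```c
/* hold_cnt has wrapped around to 5; release_cnt is still 0xfffffffd
 * (= 2^32 - 3) from before the wrap. The u32 subtraction gives
 * 5 + 3 = 8, and the s32 cast keeps it positive: eight pages are
 * genuinely still in flight.
 */
u32 hold_cnt = 5, release_cnt = 0xfffffffdU;
s32 inflight = (s32)(hold_cnt - release_cnt);	/* == 8 */
```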
0f921404 MA |
654 | void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) |
655 | { | |
656 | netmem_set_pp(netmem, pool); | |
657 | netmem_or_pp_magic(netmem, PP_SIGNATURE); | |
658 | ||
659 | /* Ensuring all pages have been split into one fragment initially: | |
660 | * page_pool_set_pp_info() is only called once for every page when it | |
661 | * is allocated from the page allocator and page_pool_fragment_page() | |
662 | * is dirtying the same cache line as the page->pp_magic above, so | |
663 | * the overhead is negligible. | |
664 | */ | |
665 | page_pool_fragment_netmem(netmem, 1); | |
666 | if (pool->has_init_callback) | |
667 | pool->slow.init_callback(netmem, pool->slow.init_arg); | |
668 | } | |
669 | ||
670 | void page_pool_clear_pp_info(netmem_ref netmem) | |
671 | { | |
672 | netmem_clear_pp_magic(netmem); | |
673 | netmem_set_pp(netmem, NULL); | |
674 | } | |
675 | ||
4dec64c5 MA |
676 | static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, |
677 | netmem_ref netmem) | |
ff7d6b27 | 678 | { |
ee62ce7a THJ |
679 | struct page *old, *page = netmem_to_page(netmem); |
680 | unsigned long id; | |
1567b85e IA |
681 | dma_addr_t dma; |
682 | ||
403f11ac | 683 | if (!pool->dma_map) |
458de8a9 IA |
684 | /* Always account for inflight pages, even if we didn't |
685 | * map them | |
686 | */ | |
c3f687d8 | 687 | return; |
ff7d6b27 | 688 | |
ee62ce7a THJ |
689 | id = netmem_get_dma_index(netmem); |
690 | if (!id) | |
691 | return; | |
692 | ||
693 | if (in_softirq()) | |
694 | old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); | |
695 | else | |
696 | old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); | |
697 | if (old != page) | |
698 | return; | |
699 | ||
4dec64c5 | 700 | dma = page_pool_get_dma_addr_netmem(netmem); |
458de8a9 | 701 | |
9ddb3c14 | 702 | /* When page is unmapped, it cannot be returned to our pool */ |
13f16d9d JDB |
703 | dma_unmap_page_attrs(pool->p.dev, dma, |
704 | PAGE_SIZE << pool->p.order, pool->p.dma_dir, | |
8e4c62c7 | 705 | DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); |
4dec64c5 | 706 | page_pool_set_dma_addr_netmem(netmem, 0); |
ee62ce7a | 707 | netmem_set_dma_index(netmem, 0); |
c3f687d8 JK |
708 | } |
709 | ||
710 | /* Disconnects a page (from a page_pool). API users can have a need | |
711 | * to disconnect a page (from a page_pool), to allow it to be used as | |
712 | * a regular page (that will eventually be returned to the normal | |
713 | * page-allocator via put_page). | |
714 | */ | |
4dec64c5 | 715 | void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) |
c3f687d8 JK |
716 | { |
717 | int count; | |
0f921404 | 718 | bool put; |
c3f687d8 | 719 | |
0f921404 | 720 | put = true; |
57afb483 PB |
721 | if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) |
722 | put = pool->mp_ops->release_netmem(pool, netmem); | |
0f921404 MA |
723 | else |
724 | __page_pool_release_page_dma(pool, netmem); | |
c07aea3e | 725 | |
c3f812ce JL |
726 | /* This may be the last page returned, releasing the pool, so |
727 | * it is not safe to reference pool afterwards. | |
728 | */ | |
7fb9b66d | 729 | count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); |
4dec64c5 | 730 | trace_page_pool_state_release(pool, netmem, count); |
99c07c43 | 731 | |
0f921404 MA |
732 | if (put) { |
733 | page_pool_clear_pp_info(netmem); | |
734 | put_page(netmem_to_page(netmem)); | |
735 | } | |
ff7d6b27 JDB |
736 | /* An optimization would be to call __free_pages(page, pool->p.order) |
737 | * knowing page is not part of page-cache (thus avoiding a | |
738 | * __page_cache_release() call). | |
739 | */ | |
740 | } | |
741 | ||
4dec64c5 | 742 | static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) |
ff7d6b27 | 743 | { |
271683bb | 744 | bool in_softirq, ret; |
ff7d6b27 | 745 | |
271683bb DC |
746 | /* BH protection not needed if current is softirq */ |
747 | in_softirq = page_pool_producer_lock(pool); | |
748 | ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); | |
749 | if (ret) | |
ad6fa1e1 | 750 | recycle_stat_inc(pool, ring); |
271683bb | 751 | page_pool_producer_unlock(pool, in_softirq); |
ad6fa1e1 | 752 | |
271683bb | 753 | return ret; |
ff7d6b27 JDB |
754 | } |
755 | ||
756 | /* Only allow direct recycling in special circumstances, into the | |
757 | * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. | |
758 | * | |
759 | * Caller must provide appropriate safe context. | |
760 | */ | |
4dec64c5 | 761 | static bool page_pool_recycle_in_cache(netmem_ref netmem, |
ff7d6b27 JDB |
762 | struct page_pool *pool) |
763 | { | |
ad6fa1e1 JD |
764 | if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { |
765 | recycle_stat_inc(pool, cache_full); | |
ff7d6b27 | 766 | return false; |
ad6fa1e1 | 767 | } |
ff7d6b27 JDB |
768 | |
769 | /* Caller MUST have verified/know (page_ref_count(page) == 1) */ | |
4dec64c5 | 770 | pool->alloc.cache[pool->alloc.count++] = netmem; |
ad6fa1e1 | 771 | recycle_stat_inc(pool, cached); |
ff7d6b27 JDB |
772 | return true; |
773 | } | |
774 | ||
4dec64c5 | 775 | static bool __page_pool_page_can_be_recycled(netmem_ref netmem) |
46f40172 | 776 | { |
8ab79ed5 MA |
777 | return netmem_is_net_iov(netmem) || |
778 | (page_ref_count(netmem_to_page(netmem)) == 1 && | |
779 | !page_is_pfmemalloc(netmem_to_page(netmem))); | |
46f40172 MA |
780 | } |
781 | ||
458de8a9 | 782 | /* If the page refcnt == 1, this will try to recycle the page. |
403f11ac | 783 | * If pool->dma_sync is set, we'll try to sync the DMA area for |
458de8a9 IA |
784 | * the configured size min(dma_sync_size, pool->max_len). |
785 | * If the page refcnt != 1, then the page will be returned to memory | |
786 | * subsystem. | |
787 | */ | |
4dec64c5 MA |
788 | static __always_inline netmem_ref |
789 | __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, | |
78862447 | 790 | unsigned int dma_sync_size, bool allow_direct) |
ff7d6b27 | 791 | { |
ff4e538c JK |
792 | lockdep_assert_no_hardirq(); |
793 | ||
ff7d6b27 JDB |
794 | /* This allocator is optimized for the XDP mode that uses |
795 | * one-frame-per-page, but has fallbacks that act like the | |
796 | * regular page allocator APIs. | |
797 | * | |
798 | * refcnt == 1 means page_pool owns page, and can recycle it. | |
05656132 AL |
799 | * |
800 | * A page is NOT reusable when it was allocated while the system was | |
801 | * under memory pressure (page_is_pfmemalloc()). | |
ff7d6b27 | 802 | */ |
4dec64c5 | 803 | if (likely(__page_pool_page_can_be_recycled(netmem))) { |
ff7d6b27 JDB |
804 | /* Read barrier done in page_ref_count / READ_ONCE */ |
805 | ||
4dec64c5 | 806 | page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); |
e68bc756 | 807 | |
4dec64c5 MA |
808 | if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) |
809 | return 0; | |
ff7d6b27 | 810 | |
78862447 | 811 | /* Page found as candidate for recycling */ |
4dec64c5 | 812 | return netmem; |
ff7d6b27 | 813 | } |
8ab79ed5 | 814 | |
ff7d6b27 JDB |
815 | /* Fallback/non-XDP mode: API user has an elevated refcnt. | |
816 | * | |
817 | * Many drivers split up the page into fragments, and some | |
818 | * want to keep doing this to save memory and do refcnt based | |
819 | * recycling. Support this use case too, to ease drivers | |
820 | * switching between XDP/non-XDP. | |
821 | * | |
822 | * In case page_pool maintains the DMA mapping, the API user must | |
823 | * call page_pool_put_page() once. In this elevated refcnt | |
824 | * case, the DMA is unmapped/released, as driver is likely | |
825 | * doing refcnt based recycle tricks, meaning another process | |
826 | * will be invoking put_page. | |
827 | */ | |
ad6fa1e1 | 828 | recycle_stat_inc(pool, released_refcnt); |
4dec64c5 | 829 | page_pool_return_page(pool, netmem); |
78862447 | 830 | |
4dec64c5 | 831 | return 0; |
78862447 LB |
832 | } |
833 | ||
4a96a4e8 AL |
834 | static bool page_pool_napi_local(const struct page_pool *pool) |
835 | { | |
836 | const struct napi_struct *napi; | |
837 | u32 cpuid; | |
838 | ||
32471b2f SAS |
839 | /* On PREEMPT_RT the softirq can be preempted by the consumer */ |
840 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) | |
841 | return false; | |
842 | ||
4a96a4e8 AL |
843 | if (unlikely(!in_softirq())) |
844 | return false; | |
845 | ||
846 | /* Allow direct recycle if we have reasons to believe that we are | |
847 | * in the same context as the consumer would run, so there's | |
848 | * no possible race. | |
849 | * __page_pool_put_page() makes sure we're not in hardirq context | |
850 | * and interrupts are enabled prior to accessing the cache. | |
851 | */ | |
852 | cpuid = smp_processor_id(); | |
853 | if (READ_ONCE(pool->cpuid) == cpuid) | |
854 | return true; | |
855 | ||
856 | napi = READ_ONCE(pool->p.napi); | |
857 | ||
858 | return napi && READ_ONCE(napi->list_owner) == cpuid; | |
859 | } | |
860 | ||
4dec64c5 MA |
861 | void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, |
862 | unsigned int dma_sync_size, bool allow_direct) | |
78862447 | 863 | { |
4a96a4e8 AL |
864 | if (!allow_direct) |
865 | allow_direct = page_pool_napi_local(pool); | |
866 | ||
170ebc60 MA |
867 | netmem = __page_pool_put_page(pool, netmem, dma_sync_size, |
868 | allow_direct); | |
4dec64c5 | 869 | if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { |
78862447 | 870 | /* Cache full, fallback to free pages */ |
ad6fa1e1 | 871 | recycle_stat_inc(pool, ring_full); |
4dec64c5 | 872 | page_pool_return_page(pool, netmem); |
78862447 | 873 | } |
ff7d6b27 | 874 | } |
4dec64c5 MA |
875 | EXPORT_SYMBOL(page_pool_put_unrefed_netmem); |
876 | ||
877 | void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, | |
878 | unsigned int dma_sync_size, bool allow_direct) | |
879 | { | |
880 | page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, | |
881 | allow_direct); | |
882 | } | |
0a149ab7 | 883 | EXPORT_SYMBOL(page_pool_put_unrefed_page); |
ff7d6b27 | 884 | |
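Drivers normally reach the functions above through the helpers in &lt;net/page_pool/helpers.h&gt;: page_pool_put_full_page() syncs the whole pool->p.max_len area, page_pool_put_page() lets the caller limit the sync to the bytes the device wrote, and page_pool_recycle_direct() is the allow_direct variant for NAPI context. A hedged sketch of an RX completion path (the skb/ring handling and mydrv_* names are assumptions):

```c
/* Illustrative RX completion handling; runs in NAPI context, so direct
 * recycling into the lockless alloc cache is safe.
 */
static void mydrv_rx_complete(struct mydrv_rx_ring *ring, struct page *page,
			      struct sk_buff *skb, unsigned int pkt_len)
{
	if (!skb) {
		/* e.g. XDP_DROP: recycle the page and only DMA-sync the
		 * bytes the device actually wrote.
		 */
		page_pool_put_page(ring->page_pool, page, pkt_len, true);
		return;
	}

	/* Page was attached to an skb: let the skb recycling path return
	 * it to the pool when the skb is eventually freed.
	 */
	skb_mark_for_recycle(skb);
	napi_gro_receive(&ring->napi, skb);
}
```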
fcc680a6 AL |
885 | static void page_pool_recycle_ring_bulk(struct page_pool *pool, |
886 | netmem_ref *bulk, | |
887 | u32 bulk_len) | |
888 | { | |
889 | bool in_softirq; | |
890 | u32 i; | |
891 | ||
892 | /* Bulk produce into ptr_ring page_pool cache */ | |
893 | in_softirq = page_pool_producer_lock(pool); | |
894 | ||
895 | for (i = 0; i < bulk_len; i++) { | |
896 | if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { | |
897 | /* ring full */ | |
898 | recycle_stat_inc(pool, ring_full); | |
899 | break; | |
900 | } | |
901 | } | |
902 | ||
903 | page_pool_producer_unlock(pool, in_softirq); | |
904 | recycle_stat_add(pool, ring, i); | |
905 | ||
906 | /* Hopefully all pages were returned into ptr_ring */ | |
907 | if (likely(i == bulk_len)) | |
908 | return; | |
909 | ||
910 | /* | |
911 | * ptr_ring cache is full, free remaining pages outside producer lock | |
912 | * since put_page() with refcnt == 1 can be an expensive operation. | |
913 | */ | |
914 | for (; i < bulk_len; i++) | |
915 | page_pool_return_page(pool, bulk[i]); | |
916 | } | |
917 | ||
82e896d9 | 918 | /** |
024bfd2e | 919 | * page_pool_put_netmem_bulk() - release references on multiple netmems |
024bfd2e AL |
920 | * @data: array holding netmem references |
921 | * @count: number of entries in @data | |
82e896d9 | 922 | * |
024bfd2e AL |
923 | * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring |
924 | * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() | |
925 | * will release leftover netmems to the memory provider. | |
926 | * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx | |
82e896d9 JK |
927 | * completion loop for the XDP_REDIRECT use case. |
928 | * | |
929 | * Please note the caller must not use the data array after running | |
024bfd2e | 930 | * page_pool_put_netmem_bulk(), as this function overwrites it. |
82e896d9 | 931 | */ |
fcc680a6 | 932 | void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) |
78862447 | 933 | { |
fcc680a6 | 934 | u32 bulk_len = 0; |
39806b96 | 935 | |
fcc680a6 | 936 | for (u32 i = 0; i < count; i++) { |
024bfd2e | 937 | netmem_ref netmem = netmem_compound_head(data[i]); |
78862447 | 938 | |
d3c9510d | 939 | if (page_pool_unref_and_test(netmem)) |
024bfd2e | 940 | data[bulk_len++] = netmem; |
78862447 LB |
941 | } |
942 | ||
fcc680a6 AL |
943 | count = bulk_len; |
944 | while (count) { | |
945 | netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; | |
946 | struct page_pool *pool = NULL; | |
947 | bool allow_direct; | |
948 | u32 foreign = 0; | |
949 | ||
950 | bulk_len = 0; | |
951 | ||
952 | for (u32 i = 0; i < count; i++) { | |
953 | struct page_pool *netmem_pp; | |
954 | netmem_ref netmem = data[i]; | |
955 | ||
956 | netmem_pp = netmem_get_pp(netmem); | |
957 | if (unlikely(!pool)) { | |
958 | pool = netmem_pp; | |
959 | allow_direct = page_pool_napi_local(pool); | |
960 | } else if (netmem_pp != pool) { | |
961 | /* | |
962 | * If the netmem belongs to a different | |
963 | * page_pool, save it for another round. | |
964 | */ | |
965 | data[foreign++] = netmem; | |
966 | continue; | |
967 | } | |
968 | ||
969 | netmem = __page_pool_put_page(pool, netmem, -1, | |
970 | allow_direct); | |
971 | /* Approved for bulk recycling in ptr_ring cache */ | |
972 | if (netmem) | |
973 | bulk[bulk_len++] = netmem; | |
590032a4 | 974 | } |
78862447 | 975 | |
fcc680a6 AL |
976 | if (bulk_len) |
977 | page_pool_recycle_ring_bulk(pool, bulk, bulk_len); | |
78862447 | 978 | |
fcc680a6 AL |
979 | count = foreign; |
980 | } | |
78862447 | 981 | } |
024bfd2e | 982 | EXPORT_SYMBOL(page_pool_put_netmem_bulk); |
78862447 | 983 | |
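A caller-side sketch of the bulk API: collect netmem references into a small on-stack array and hand the whole batch over once, letting page_pool_put_netmem_bulk() sort the entries per pool. The batch size and the source of the pages here are illustrative assumptions:

```c
/* Illustrative TX-completion batching; where the pages come from is
 * driver specific and assumed here.
 */
static void mydrv_clean_tx_batch(struct page **pages, u32 n)
{
	netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
	u32 i, len = 0;

	for (i = 0; i < n && len < XDP_BULK_QUEUE_SIZE; i++)
		bulk[len++] = page_to_netmem(pages[i]);

	/* Drops one reference per entry; entries are recycled into their
	 * owning pool's ptr_ring or released. bulk[] must not be reused
	 * afterwards.
	 */
	page_pool_put_netmem_bulk(bulk, len);
}
```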
4dec64c5 MA |
984 | static netmem_ref page_pool_drain_frag(struct page_pool *pool, |
985 | netmem_ref netmem) | |
53e0961d YL |
986 | { |
987 | long drain_count = BIAS_MAX - pool->frag_users; | |
988 | ||
989 | /* Some user is still using the page frag */ | |
4dec64c5 MA |
990 | if (likely(page_pool_unref_netmem(netmem, drain_count))) |
991 | return 0; | |
53e0961d | 992 | |
4dec64c5 MA |
993 | if (__page_pool_page_can_be_recycled(netmem)) { |
994 | page_pool_dma_sync_for_device(pool, netmem, -1); | |
995 | return netmem; | |
53e0961d YL |
996 | } |
997 | ||
4dec64c5 MA |
998 | page_pool_return_page(pool, netmem); |
999 | return 0; | |
53e0961d YL |
1000 | } |
1001 | ||
1002 | static void page_pool_free_frag(struct page_pool *pool) | |
1003 | { | |
1004 | long drain_count = BIAS_MAX - pool->frag_users; | |
4dec64c5 | 1005 | netmem_ref netmem = pool->frag_page; |
53e0961d | 1006 | |
4dec64c5 | 1007 | pool->frag_page = 0; |
53e0961d | 1008 | |
4dec64c5 | 1009 | if (!netmem || page_pool_unref_netmem(netmem, drain_count)) |
53e0961d YL |
1010 | return; |
1011 | ||
4dec64c5 | 1012 | page_pool_return_page(pool, netmem); |
53e0961d YL |
1013 | } |
1014 | ||
4dec64c5 MA |
1015 | netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, |
1016 | unsigned int *offset, unsigned int size, | |
1017 | gfp_t gfp) | |
53e0961d YL |
1018 | { |
1019 | unsigned int max_size = PAGE_SIZE << pool->p.order; | |
4dec64c5 | 1020 | netmem_ref netmem = pool->frag_page; |
53e0961d | 1021 | |
09d96ee5 | 1022 | if (WARN_ON(size > max_size)) |
4dec64c5 | 1023 | return 0; |
53e0961d YL |
1024 | |
1025 | size = ALIGN(size, dma_get_cache_alignment()); | |
1026 | *offset = pool->frag_offset; | |
1027 | ||
4dec64c5 MA |
1028 | if (netmem && *offset + size > max_size) { |
1029 | netmem = page_pool_drain_frag(pool, netmem); | |
1030 | if (netmem) { | |
ef04d290 | 1031 | recycle_stat_inc(pool, cached); |
0f6deac3 | 1032 | alloc_stat_inc(pool, fast); |
53e0961d | 1033 | goto frag_reset; |
0f6deac3 | 1034 | } |
53e0961d YL |
1035 | } |
1036 | ||
4dec64c5 | 1037 | if (!netmem) { |
91a152cb | 1038 | netmem = page_pool_alloc_netmems(pool, gfp); |
4dec64c5 MA |
1039 | if (unlikely(!netmem)) { |
1040 | pool->frag_page = 0; | |
1041 | return 0; | |
53e0961d YL |
1042 | } |
1043 | ||
4dec64c5 | 1044 | pool->frag_page = netmem; |
53e0961d YL |
1045 | |
1046 | frag_reset: | |
1047 | pool->frag_users = 1; | |
1048 | *offset = 0; | |
1049 | pool->frag_offset = size; | |
4dec64c5 MA |
1050 | page_pool_fragment_netmem(netmem, BIAS_MAX); |
1051 | return netmem; | |
53e0961d YL |
1052 | } |
1053 | ||
1054 | pool->frag_users++; | |
1055 | pool->frag_offset = *offset + size; | |
4dec64c5 MA |
1056 | return netmem; |
1057 | } | |
1058 | EXPORT_SYMBOL(page_pool_alloc_frag_netmem); | |
1059 | ||
1060 | struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, | |
1061 | unsigned int size, gfp_t gfp) | |
1062 | { | |
1063 | return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, | |
1064 | gfp)); | |
53e0961d YL |
1065 | } |
1066 | EXPORT_SYMBOL(page_pool_alloc_frag); | |
1067 | ||
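The frag API lets several RX buffers share one (possibly high-order) page; the pool hands out cache-aligned slices and tracks users through the BIAS_MAX scheme above. A small sketch of carving out a fixed-size buffer (the 2 KiB size is an assumption):

```c
/* Illustrative fragment allocation: 2 KiB RX buffers carved out of
 * pool pages; the pool must have been created with a compatible order
 * and max_len.
 */
static struct page *mydrv_alloc_rx_frag(struct page_pool *pool,
					unsigned int *offset)
{
	/* Returns the backing page and the offset of this slice; the DMA
	 * address of the slice is page_pool_get_dma_addr(page) + *offset.
	 */
	return page_pool_alloc_frag(pool, offset, 2048, GFP_ATOMIC);
}
```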
458de8a9 | 1068 | static void page_pool_empty_ring(struct page_pool *pool) |
ff7d6b27 | 1069 | { |
4dec64c5 | 1070 | netmem_ref netmem; |
ff7d6b27 JDB |
1071 | |
1072 | /* Empty recycle ring */ | |
4dec64c5 | 1073 | while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { |
ff7d6b27 | 1074 | /* Verify the refcnt invariant of cached pages */ |
8ab79ed5 | 1075 | if (!(netmem_ref_count(netmem) == 1)) |
ff7d6b27 | 1076 | pr_crit("%s() page_pool refcnt %d violation\n", |
4dec64c5 | 1077 | __func__, netmem_ref_count(netmem)); |
ff7d6b27 | 1078 | |
4dec64c5 | 1079 | page_pool_return_page(pool, netmem); |
ff7d6b27 JDB |
1080 | } |
1081 | } | |
1082 | ||
de97502e | 1083 | static void __page_pool_destroy(struct page_pool *pool) |
d956a048 | 1084 | { |
c3f812ce JL |
1085 | if (pool->disconnect) |
1086 | pool->disconnect(pool); | |
e54cfd7e | 1087 | |
f17c6964 | 1088 | page_pool_unlist(pool); |
23cfaf67 | 1089 | page_pool_uninit(pool); |
0f921404 | 1090 | |
57afb483 PB |
1091 | if (pool->mp_ops) { |
1092 | pool->mp_ops->destroy(pool); | |
0f921404 MA |
1093 | static_branch_dec(&page_pool_mem_providers); |
1094 | } | |
1095 | ||
e54cfd7e JDB |
1096 | kfree(pool); |
1097 | } | |
e54cfd7e | 1098 | |
7c9e6942 | 1099 | static void page_pool_empty_alloc_cache_once(struct page_pool *pool) |
ff7d6b27 | 1100 | { |
4dec64c5 | 1101 | netmem_ref netmem; |
ff7d6b27 | 1102 | |
7c9e6942 JDB |
1103 | if (pool->destroy_cnt) |
1104 | return; | |
1105 | ||
ff7d6b27 JDB |
1106 | /* Empty alloc cache, assume caller made sure this is |
1107 | * no longer in use, and page_pool_alloc_pages() cannot be | |
1108 | * called concurrently. | |
1109 | */ | |
1110 | while (pool->alloc.count) { | |
4dec64c5 MA |
1111 | netmem = pool->alloc.cache[--pool->alloc.count]; |
1112 | page_pool_return_page(pool, netmem); | |
ff7d6b27 | 1113 | } |
7c9e6942 JDB |
1114 | } |
1115 | ||
1116 | static void page_pool_scrub(struct page_pool *pool) | |
1117 | { | |
ee62ce7a THJ |
1118 | unsigned long id; |
1119 | void *ptr; | |
1120 | ||
7c9e6942 | 1121 | page_pool_empty_alloc_cache_once(pool); |
ee62ce7a THJ |
1122 | if (!pool->destroy_cnt++ && pool->dma_map) { |
1123 | if (pool->dma_sync) { | |
1124 | /* Disable page_pool_dma_sync_for_device() */ | |
1125 | pool->dma_sync = false; | |
1126 | ||
1127 | /* Make sure all concurrent returns that may see the old | |
1128 | * value of dma_sync (and thus perform a sync) have | |
1129 | * finished before doing the unmapping below. Skip the | |
1130 | * wait if the device doesn't actually need syncing, or | |
1131 | * if there are no outstanding mapped pages. | |
1132 | */ | |
1133 | if (dma_dev_need_sync(pool->p.dev) && | |
1134 | !xa_empty(&pool->dma_mapped)) | |
1135 | synchronize_net(); | |
1136 | } | |
1137 | ||
1138 | xa_for_each(&pool->dma_mapped, id, ptr) | |
1139 | __page_pool_release_page_dma(pool, page_to_netmem(ptr)); | |
1140 | } | |
ff7d6b27 JDB |
1141 | |
1142 | /* No more consumers should exist, but producers could still | |
1143 | * be in-flight. | |
1144 | */ | |
458de8a9 | 1145 | page_pool_empty_ring(pool); |
c3f812ce JL |
1146 | } |
1147 | ||
1148 | static int page_pool_release(struct page_pool *pool) | |
1149 | { | |
271683bb | 1150 | bool in_softirq; |
c3f812ce JL |
1151 | int inflight; |
1152 | ||
1153 | page_pool_scrub(pool); | |
7aee8429 | 1154 | inflight = page_pool_inflight(pool, true); |
271683bb DC |
1155 | /* Acquire producer lock to make sure producers have exited. */ |
1156 | in_softirq = page_pool_producer_lock(pool); | |
1157 | page_pool_producer_unlock(pool, in_softirq); | |
c3f812ce | 1158 | if (!inflight) |
de97502e | 1159 | __page_pool_destroy(pool); |
c3f812ce JL |
1160 | |
1161 | return inflight; | |
1162 | } | |
1163 | ||
1164 | static void page_pool_release_retry(struct work_struct *wq) | |
1165 | { | |
1166 | struct delayed_work *dwq = to_delayed_work(wq); | |
1167 | struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); | |
be009667 | 1168 | void *netdev; |
c3f812ce JL |
1169 | int inflight; |
1170 | ||
1171 | inflight = page_pool_release(pool); | |
43130d02 JX |
1172 | /* In rare cases, a driver bug may cause inflight to go negative. |
1173 | * Don't reschedule release if inflight is 0 or negative. | |
1174 | * - If 0, the page_pool has been destroyed | |
1175 | * - If negative, we will never recover. | |
1176 | * In both cases no reschedule is necessary. | |
1177 | */ | |
1178 | if (inflight <= 0) | |
c3f812ce JL |
1179 | return; |
1180 | ||
be009667 JK |
1181 | /* Periodic warning for page pools the user can't see */ |
1182 | netdev = READ_ONCE(pool->slow.netdev); | |
1183 | if (time_after_eq(jiffies, pool->defer_warn) && | |
1184 | (!netdev || netdev == NET_PTR_POISON)) { | |
c3f812ce JL |
1185 | int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; |
1186 | ||
be009667 JK |
1187 | pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", |
1188 | __func__, pool->user.id, inflight, sec); | |
c3f812ce JL |
1189 | pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; |
1190 | } | |
1191 | ||
1192 | /* Still not ready to be disconnected, retry later */ | |
1193 | schedule_delayed_work(&pool->release_dw, DEFER_TIME); | |
1194 | } | |
1195 | ||
64693ec7 | 1196 | void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), |
ef9226cd | 1197 | const struct xdp_mem_info *mem) |
c3f812ce JL |
1198 | { |
1199 | refcount_inc(&pool->user_cnt); | |
1200 | pool->disconnect = disconnect; | |
64693ec7 | 1201 | pool->xdp_mem_id = mem->id; |
c3f812ce JL |
1202 | } |
1203 | ||
d7f39aee | 1204 | void page_pool_disable_direct_recycling(struct page_pool *pool) |
dd64b232 | 1205 | { |
56ef27e3 | 1206 | /* Disable direct recycling based on pool->cpuid. |
4a96a4e8 | 1207 | * Paired with READ_ONCE() in page_pool_napi_local(). |
56ef27e3 AL |
1208 | */ |
1209 | WRITE_ONCE(pool->cpuid, -1); | |
1210 | ||
dd64b232 JK |
1211 | if (!pool->p.napi) |
1212 | return; | |
1213 | ||
c1e00bc4 | 1214 | napi_assert_will_not_race(pool->p.napi); |
dd64b232 | 1215 | |
67e4bb2c | 1216 | mutex_lock(&page_pools_lock); |
dd64b232 | 1217 | WRITE_ONCE(pool->p.napi, NULL); |
67e4bb2c | 1218 | mutex_unlock(&page_pools_lock); |
dd64b232 | 1219 | } |
d7f39aee | 1220 | EXPORT_SYMBOL(page_pool_disable_direct_recycling); |
dd64b232 | 1221 | |
c3f812ce JL |
1222 | void page_pool_destroy(struct page_pool *pool) |
1223 | { | |
1224 | if (!pool) | |
1225 | return; | |
1226 | ||
1227 | if (!page_pool_put(pool)) | |
1228 | return; | |
1229 | ||
56ef27e3 | 1230 | page_pool_disable_direct_recycling(pool); |
53e0961d YL |
1231 | page_pool_free_frag(pool); |
1232 | ||
c3f812ce JL |
1233 | if (!page_pool_release(pool)) |
1234 | return; | |
1235 | ||
69cb4952 | 1236 | page_pool_detached(pool); |
c3f812ce JL |
1237 | pool->defer_start = jiffies; |
1238 | pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; | |
ff7d6b27 | 1239 | |
c3f812ce JL |
1240 | INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); |
1241 | schedule_delayed_work(&pool->release_dw, DEFER_TIME); | |
ff7d6b27 | 1242 | } |
c3f812ce | 1243 | EXPORT_SYMBOL(page_pool_destroy); |
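Teardown ordering matters: the driver must stop producing new allocations (disable NAPI, stop the RX ring) and return the buffers it still holds before calling page_pool_destroy(); pages in flight elsewhere are then drained asynchronously by the release_retry worker above. A hedged teardown sketch with assumed mydrv_* helpers:

```c
/* Illustrative teardown; ring/NAPI handling is driver specific. */
static void mydrv_destroy_rx_ring(struct mydrv_rx_ring *ring)
{
	napi_disable(&ring->napi);	/* no more alloc/recycle from NAPI */
	mydrv_free_rx_descs(ring);	/* return pages the ring still owns */

	page_pool_destroy(ring->page_pool);	/* inflight pages drain later */
	ring->page_pool = NULL;
}
```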
bc836748 SM |
1244 | |
1245 | /* Caller must provide appropriate safe context, e.g. NAPI. */ | |
1246 | void page_pool_update_nid(struct page_pool *pool, int new_nid) | |
1247 | { | |
4dec64c5 | 1248 | netmem_ref netmem; |
44768dec | 1249 | |
bc836748 SM |
1250 | trace_page_pool_update_nid(pool, new_nid); |
1251 | pool->p.nid = new_nid; | |
44768dec JDB |
1252 | |
1253 | /* Flush pool alloc cache, as refill will check NUMA node */ | |
1254 | while (pool->alloc.count) { | |
4dec64c5 MA |
1255 | netmem = pool->alloc.cache[--pool->alloc.count]; |
1256 | page_pool_return_page(pool, netmem); | |
44768dec | 1257 | } |
bc836748 SM |
1258 | } |
1259 | EXPORT_SYMBOL(page_pool_update_nid); | |
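Rather than calling this directly, drivers usually go through the page_pool_nid_changed() helper in &lt;net/page_pool/helpers.h&gt;, which only takes the update path when the node actually changed; a typical call site is the start of the NAPI poll loop. A sketch with assumed mydrv_* types:

```c
/* Illustrative NAPI poll prologue: keep allocations local to the CPU
 * currently running RX for this ring.
 */
static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_rx_ring *ring = container_of(napi, struct mydrv_rx_ring,
						  napi);

	page_pool_nid_changed(ring->page_pool, numa_mem_id());

	return mydrv_clean_rx(ring, budget);	/* hypothetical RX processing */
}
```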
56102c01 PB |
1260 | |
1261 | bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) | |
1262 | { | |
1263 | return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); | |
1264 | } | |
1265 | ||
1266 | /* Associate a niov with a page pool. Should be followed by a matching | |
1267 | * net_mp_niov_clear_page_pool(). | |
1268 | */ | |
1269 | void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) | |
1270 | { | |
1271 | netmem_ref netmem = net_iov_to_netmem(niov); | |
1272 | ||
1273 | page_pool_set_pp_info(pool, netmem); | |
1274 | ||
1275 | pool->pages_state_hold_cnt++; | |
1276 | trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); | |
1277 | } | |
1278 | ||
1279 | /* Disassociate a niov from a page pool. Should only be used in the | |
1280 | * ->release_netmem() path. | |
1281 | */ | |
1282 | void net_mp_niov_clear_page_pool(struct net_iov *niov) | |
1283 | { | |
1284 | netmem_ref netmem = net_iov_to_netmem(niov); | |
1285 | ||
1286 | page_pool_clear_pp_info(netmem); | |
1287 | } |