| 1 | /* SPDX-License-Identifier: GPL-2.0 |
| 2 | * |
| 3 | * page_pool.c |
| 4 | * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> |
| 5 | * Copyright (C) 2016 Red Hat, Inc. |
| 6 | */ |
| 7 | |
| 8 | #include <linux/error-injection.h> |
| 9 | #include <linux/types.h> |
| 10 | #include <linux/kernel.h> |
| 11 | #include <linux/slab.h> |
| 12 | #include <linux/device.h> |
| 13 | |
| 14 | #include <net/netdev_lock.h> |
| 15 | #include <net/netdev_rx_queue.h> |
| 16 | #include <net/page_pool/helpers.h> |
| 17 | #include <net/page_pool/memory_provider.h> |
| 18 | #include <net/xdp.h> |
| 19 | |
| 20 | #include <linux/dma-direction.h> |
| 21 | #include <linux/dma-mapping.h> |
| 22 | #include <linux/page-flags.h> |
| 23 | #include <linux/mm.h> /* for put_page() */ |
| 24 | #include <linux/poison.h> |
| 25 | #include <linux/ethtool.h> |
| 26 | #include <linux/netdevice.h> |
| 27 | |
| 28 | #include <trace/events/page_pool.h> |
| 29 | |
| 30 | #include "dev.h" |
| 31 | #include "mp_dmabuf_devmem.h" |
| 32 | #include "netmem_priv.h" |
| 33 | #include "page_pool_priv.h" |
| 34 | |
| 35 | DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); |
| 36 | |
| 37 | #define DEFER_TIME (msecs_to_jiffies(1000)) |
| 38 | #define DEFER_WARN_INTERVAL (60 * HZ) |
| 39 | |
| 40 | #define BIAS_MAX (LONG_MAX >> 1) |
| 41 | |
| 42 | #ifdef CONFIG_PAGE_POOL_STATS |
| 43 | static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); |
| 44 | |
| 45 | /* alloc_stat_inc is intended to be used in softirq context */ |
| 46 | #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) |
| 47 | /* recycle_stat_inc is safe to use when preemption is possible. */ |
| 48 | #define recycle_stat_inc(pool, __stat) \ |
| 49 | do { \ |
| 50 | struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ |
| 51 | this_cpu_inc(s->__stat); \ |
| 52 | } while (0) |
| 53 | |
| 54 | #define recycle_stat_add(pool, __stat, val) \ |
| 55 | do { \ |
| 56 | struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ |
| 57 | this_cpu_add(s->__stat, val); \ |
| 58 | } while (0) |
| 59 | |
| 60 | static const char pp_stats[][ETH_GSTRING_LEN] = { |
| 61 | "rx_pp_alloc_fast", |
| 62 | "rx_pp_alloc_slow", |
| 63 | "rx_pp_alloc_slow_ho", |
| 64 | "rx_pp_alloc_empty", |
| 65 | "rx_pp_alloc_refill", |
| 66 | "rx_pp_alloc_waive", |
| 67 | "rx_pp_recycle_cached", |
| 68 | "rx_pp_recycle_cache_full", |
| 69 | "rx_pp_recycle_ring", |
| 70 | "rx_pp_recycle_ring_full", |
| 71 | "rx_pp_recycle_released_ref", |
| 72 | }; |
| 73 | |
| 74 | /** |
| 75 | * page_pool_get_stats() - fetch page pool stats |
 * @pool: pool to read statistics from
| 77 | * @stats: struct page_pool_stats to fill in |
| 78 | * |
| 79 | * Retrieve statistics about the page_pool. This API is only available |
| 80 | * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. |
 * The caller passes in a pointer to a caller-allocated struct
 * page_pool_stats, which this function fills in. The caller can then report
 * those stats to the user (perhaps via ethtool, debugfs, etc.).
| 84 | */ |
| 85 | bool page_pool_get_stats(const struct page_pool *pool, |
| 86 | struct page_pool_stats *stats) |
| 87 | { |
| 88 | int cpu = 0; |
| 89 | |
| 90 | if (!stats) |
| 91 | return false; |
| 92 | |
	/* The caller is responsible for initializing stats. */
| 94 | stats->alloc_stats.fast += pool->alloc_stats.fast; |
| 95 | stats->alloc_stats.slow += pool->alloc_stats.slow; |
| 96 | stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; |
| 97 | stats->alloc_stats.empty += pool->alloc_stats.empty; |
| 98 | stats->alloc_stats.refill += pool->alloc_stats.refill; |
| 99 | stats->alloc_stats.waive += pool->alloc_stats.waive; |
| 100 | |
| 101 | for_each_possible_cpu(cpu) { |
| 102 | const struct page_pool_recycle_stats *pcpu = |
| 103 | per_cpu_ptr(pool->recycle_stats, cpu); |
| 104 | |
| 105 | stats->recycle_stats.cached += pcpu->cached; |
| 106 | stats->recycle_stats.cache_full += pcpu->cache_full; |
| 107 | stats->recycle_stats.ring += pcpu->ring; |
| 108 | stats->recycle_stats.ring_full += pcpu->ring_full; |
| 109 | stats->recycle_stats.released_refcnt += pcpu->released_refcnt; |
| 110 | } |
| 111 | |
| 112 | return true; |
| 113 | } |
| 114 | EXPORT_SYMBOL(page_pool_get_stats); |
| 115 | |
| 116 | u8 *page_pool_ethtool_stats_get_strings(u8 *data) |
| 117 | { |
| 118 | int i; |
| 119 | |
| 120 | for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { |
| 121 | memcpy(data, pp_stats[i], ETH_GSTRING_LEN); |
| 122 | data += ETH_GSTRING_LEN; |
| 123 | } |
| 124 | |
| 125 | return data; |
| 126 | } |
| 127 | EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); |
| 128 | |
| 129 | int page_pool_ethtool_stats_get_count(void) |
| 130 | { |
| 131 | return ARRAY_SIZE(pp_stats); |
| 132 | } |
| 133 | EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); |
| 134 | |
| 135 | u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) |
| 136 | { |
| 137 | const struct page_pool_stats *pool_stats = stats; |
| 138 | |
| 139 | *data++ = pool_stats->alloc_stats.fast; |
| 140 | *data++ = pool_stats->alloc_stats.slow; |
| 141 | *data++ = pool_stats->alloc_stats.slow_high_order; |
| 142 | *data++ = pool_stats->alloc_stats.empty; |
| 143 | *data++ = pool_stats->alloc_stats.refill; |
| 144 | *data++ = pool_stats->alloc_stats.waive; |
| 145 | *data++ = pool_stats->recycle_stats.cached; |
| 146 | *data++ = pool_stats->recycle_stats.cache_full; |
| 147 | *data++ = pool_stats->recycle_stats.ring; |
| 148 | *data++ = pool_stats->recycle_stats.ring_full; |
| 149 | *data++ = pool_stats->recycle_stats.released_refcnt; |
| 150 | |
| 151 | return data; |
| 152 | } |
| 153 | EXPORT_SYMBOL(page_pool_ethtool_stats_get); |
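
/* Illustrative sketch (not part of this file): how a driver might combine
 * page_pool_get_stats() with the ethtool helpers above. The mydrv_* names
 * are hypothetical; only the page_pool_* calls are real.
 *
 *	static void mydrv_get_ethtool_stats(struct net_device *dev,
 *					    struct ethtool_stats *es, u64 *data)
 *	{
 *		struct mydrv_priv *priv = netdev_priv(dev);
 *		struct page_pool_stats pp_stats = { };
 *
 *		page_pool_get_stats(priv->rx_page_pool, &pp_stats);
 *		data = page_pool_ethtool_stats_get(data, &pp_stats);
 *	}
 *
 * The matching ->get_strings() callback would use
 * page_pool_ethtool_stats_get_strings(), and ->get_sset_count() would add
 * page_pool_ethtool_stats_get_count() to the driver's own counter count.
 */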
| 154 | |
| 155 | #else |
| 156 | #define alloc_stat_inc(...) do { } while (0) |
| 157 | #define recycle_stat_inc(...) do { } while (0) |
| 158 | #define recycle_stat_add(...) do { } while (0) |
| 159 | #endif |
| 160 | |
| 161 | static bool page_pool_producer_lock(struct page_pool *pool) |
| 162 | __acquires(&pool->ring.producer_lock) |
| 163 | { |
| 164 | bool in_softirq = in_softirq(); |
| 165 | |
| 166 | if (in_softirq) |
| 167 | spin_lock(&pool->ring.producer_lock); |
| 168 | else |
| 169 | spin_lock_bh(&pool->ring.producer_lock); |
| 170 | |
| 171 | return in_softirq; |
| 172 | } |
| 173 | |
| 174 | static void page_pool_producer_unlock(struct page_pool *pool, |
| 175 | bool in_softirq) |
| 176 | __releases(&pool->ring.producer_lock) |
| 177 | { |
| 178 | if (in_softirq) |
| 179 | spin_unlock(&pool->ring.producer_lock); |
| 180 | else |
| 181 | spin_unlock_bh(&pool->ring.producer_lock); |
| 182 | } |
| 183 | |
| 184 | static void page_pool_struct_check(void) |
| 185 | { |
| 186 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); |
| 187 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); |
| 188 | CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); |
| 189 | CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, |
| 190 | PAGE_POOL_FRAG_GROUP_ALIGN); |
| 191 | } |
| 192 | |
| 193 | static int page_pool_init(struct page_pool *pool, |
| 194 | const struct page_pool_params *params, |
| 195 | int cpuid) |
| 196 | { |
| 197 | unsigned int ring_qsize = 1024; /* Default */ |
| 198 | struct netdev_rx_queue *rxq; |
| 199 | int err; |
| 200 | |
| 201 | page_pool_struct_check(); |
| 202 | |
| 203 | memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); |
| 204 | memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); |
| 205 | |
| 206 | pool->cpuid = cpuid; |
| 207 | pool->dma_sync_for_cpu = true; |
| 208 | |
| 209 | /* Validate only known flags were used */ |
| 210 | if (pool->slow.flags & ~PP_FLAG_ALL) |
| 211 | return -EINVAL; |
| 212 | |
| 213 | if (pool->p.pool_size) |
| 214 | ring_qsize = pool->p.pool_size; |
| 215 | |
| 216 | /* Sanity limit mem that can be pinned down */ |
| 217 | if (ring_qsize > 32768) |
| 218 | return -E2BIG; |
| 219 | |
	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
| 224 | if (pool->slow.flags & PP_FLAG_DMA_MAP) { |
| 225 | if ((pool->p.dma_dir != DMA_FROM_DEVICE) && |
| 226 | (pool->p.dma_dir != DMA_BIDIRECTIONAL)) |
| 227 | return -EINVAL; |
| 228 | |
| 229 | pool->dma_map = true; |
| 230 | } |
| 231 | |
| 232 | if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { |
| 233 | /* In order to request DMA-sync-for-device the page |
| 234 | * needs to be mapped |
| 235 | */ |
| 236 | if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) |
| 237 | return -EINVAL; |
| 238 | |
| 239 | if (!pool->p.max_len) |
| 240 | return -EINVAL; |
| 241 | |
| 242 | pool->dma_sync = true; |
| 243 | |
| 244 | /* pool->p.offset has to be set according to the address |
| 245 | * offset used by the DMA engine to start copying rx data |
| 246 | */ |
| 247 | } |
| 248 | |
| 249 | pool->has_init_callback = !!pool->slow.init_callback; |
| 250 | |
| 251 | #ifdef CONFIG_PAGE_POOL_STATS |
| 252 | if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { |
| 253 | pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); |
| 254 | if (!pool->recycle_stats) |
| 255 | return -ENOMEM; |
| 256 | } else { |
		/* For the system page pool instances we use a single shared
		 * stats object instead of allocating a separate percpu
		 * variable for each (also percpu) page pool instance.
		 */
| 261 | pool->recycle_stats = &pp_system_recycle_stats; |
| 262 | pool->system = true; |
| 263 | } |
| 264 | #endif |
| 265 | |
| 266 | if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { |
| 267 | #ifdef CONFIG_PAGE_POOL_STATS |
| 268 | if (!pool->system) |
| 269 | free_percpu(pool->recycle_stats); |
| 270 | #endif |
| 271 | return -ENOMEM; |
| 272 | } |
| 273 | |
| 274 | atomic_set(&pool->pages_state_release_cnt, 0); |
| 275 | |
	/* The driver that calls page_pool_create() must also call page_pool_destroy() */
| 277 | refcount_set(&pool->user_cnt, 1); |
| 278 | |
| 279 | xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); |
| 280 | |
| 281 | if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { |
| 282 | netdev_assert_locked(pool->slow.netdev); |
| 283 | rxq = __netif_get_rx_queue(pool->slow.netdev, |
| 284 | pool->slow.queue_idx); |
| 285 | pool->mp_priv = rxq->mp_params.mp_priv; |
| 286 | pool->mp_ops = rxq->mp_params.mp_ops; |
| 287 | } |
| 288 | |
| 289 | if (pool->mp_ops) { |
| 290 | if (!pool->dma_map || !pool->dma_sync) |
| 291 | return -EOPNOTSUPP; |
| 292 | |
| 293 | if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { |
| 294 | err = -EFAULT; |
| 295 | goto free_ptr_ring; |
| 296 | } |
| 297 | |
| 298 | err = pool->mp_ops->init(pool); |
| 299 | if (err) { |
| 300 | pr_warn("%s() mem-provider init failed %d\n", __func__, |
| 301 | err); |
| 302 | goto free_ptr_ring; |
| 303 | } |
| 304 | |
| 305 | static_branch_inc(&page_pool_mem_providers); |
| 306 | } |
| 307 | |
| 308 | return 0; |
| 309 | |
| 310 | free_ptr_ring: |
| 311 | ptr_ring_cleanup(&pool->ring, NULL); |
| 312 | #ifdef CONFIG_PAGE_POOL_STATS |
| 313 | if (!pool->system) |
| 314 | free_percpu(pool->recycle_stats); |
| 315 | #endif |
| 316 | return err; |
| 317 | } |
| 318 | |
| 319 | static void page_pool_uninit(struct page_pool *pool) |
| 320 | { |
| 321 | ptr_ring_cleanup(&pool->ring, NULL); |
| 322 | xa_destroy(&pool->dma_mapped); |
| 323 | |
| 324 | #ifdef CONFIG_PAGE_POOL_STATS |
| 325 | if (!pool->system) |
| 326 | free_percpu(pool->recycle_stats); |
| 327 | #endif |
| 328 | } |
| 329 | |
| 330 | /** |
| 331 | * page_pool_create_percpu() - create a page pool for a given cpu. |
| 332 | * @params: parameters, see struct page_pool_params |
| 333 | * @cpuid: cpu identifier |
| 334 | */ |
| 335 | struct page_pool * |
| 336 | page_pool_create_percpu(const struct page_pool_params *params, int cpuid) |
| 337 | { |
| 338 | struct page_pool *pool; |
| 339 | int err; |
| 340 | |
| 341 | pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); |
| 342 | if (!pool) |
| 343 | return ERR_PTR(-ENOMEM); |
| 344 | |
| 345 | err = page_pool_init(pool, params, cpuid); |
| 346 | if (err < 0) |
| 347 | goto err_free; |
| 348 | |
| 349 | err = page_pool_list(pool); |
| 350 | if (err) |
| 351 | goto err_uninit; |
| 352 | |
| 353 | return pool; |
| 354 | |
| 355 | err_uninit: |
| 356 | page_pool_uninit(pool); |
| 357 | err_free: |
| 358 | pr_warn("%s() gave up with errno %d\n", __func__, err); |
| 359 | kfree(pool); |
| 360 | return ERR_PTR(err); |
| 361 | } |
| 362 | EXPORT_SYMBOL(page_pool_create_percpu); |
| 363 | |
| 364 | /** |
| 365 | * page_pool_create() - create a page pool |
| 366 | * @params: parameters, see struct page_pool_params |
| 367 | */ |
| 368 | struct page_pool *page_pool_create(const struct page_pool_params *params) |
| 369 | { |
| 370 | return page_pool_create_percpu(params, -1); |
| 371 | } |
| 372 | EXPORT_SYMBOL(page_pool_create); |
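
/* Illustrative sketch (not part of this file): typical driver-side setup of a
 * page_pool for an RX ring. The ring_size, pdev and rxq names are
 * hypothetical; real drivers choose flags, dma_dir, max_len and offset to
 * match their RX descriptor layout.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *		.order		= 0,
 *		.pool_size	= ring_size,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.napi		= &rxq->napi,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *		.max_len	= PAGE_SIZE,
 *		.offset		= 0,
 *	};
 *	struct page_pool *pool = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pool))
 *		return PTR_ERR(pool);
 *
 * The matching teardown is page_pool_destroy(), called once the RX ring no
 * longer hands out pages from this pool (see the sketch near that function).
 */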
| 373 | |
| 374 | static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); |
| 375 | |
| 376 | static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) |
| 377 | { |
| 378 | struct ptr_ring *r = &pool->ring; |
| 379 | netmem_ref netmem; |
| 380 | int pref_nid; /* preferred NUMA node */ |
| 381 | |
| 382 | /* Quicker fallback, avoid locks when ring is empty */ |
| 383 | if (__ptr_ring_empty(r)) { |
| 384 | alloc_stat_inc(pool, empty); |
| 385 | return 0; |
| 386 | } |
| 387 | |
	/* Softirq context guarantees the CPU, and thus the NUMA node, is
	 * stable. This assumes the CPU refilling the driver RX-ring also
	 * runs the RX-NAPI.
	 */
| 391 | #ifdef CONFIG_NUMA |
| 392 | pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; |
| 393 | #else |
| 394 | /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ |
| 395 | pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ |
| 396 | #endif |
| 397 | |
| 398 | /* Refill alloc array, but only if NUMA match */ |
| 399 | do { |
| 400 | netmem = (__force netmem_ref)__ptr_ring_consume(r); |
| 401 | if (unlikely(!netmem)) |
| 402 | break; |
| 403 | |
| 404 | if (likely(netmem_is_pref_nid(netmem, pref_nid))) { |
| 405 | pool->alloc.cache[pool->alloc.count++] = netmem; |
| 406 | } else { |
			/* NUMA mismatch:
			 * (1) release 1 page to the page-allocator and
			 * (2) break out and fall through to alloc_pages_node.
			 * This limits stress on the page buddy allocator.
			 */
| 412 | page_pool_return_page(pool, netmem); |
| 413 | alloc_stat_inc(pool, waive); |
| 414 | netmem = 0; |
| 415 | break; |
| 416 | } |
| 417 | } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); |
| 418 | |
| 419 | /* Return last page */ |
| 420 | if (likely(pool->alloc.count > 0)) { |
| 421 | netmem = pool->alloc.cache[--pool->alloc.count]; |
| 422 | alloc_stat_inc(pool, refill); |
| 423 | } |
| 424 | |
| 425 | return netmem; |
| 426 | } |
| 427 | |
| 428 | /* fast path */ |
| 429 | static netmem_ref __page_pool_get_cached(struct page_pool *pool) |
| 430 | { |
| 431 | netmem_ref netmem; |
| 432 | |
| 433 | /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ |
| 434 | if (likely(pool->alloc.count)) { |
| 435 | /* Fast-path */ |
| 436 | netmem = pool->alloc.cache[--pool->alloc.count]; |
| 437 | alloc_stat_inc(pool, fast); |
| 438 | } else { |
| 439 | netmem = page_pool_refill_alloc_cache(pool); |
| 440 | } |
| 441 | |
| 442 | return netmem; |
| 443 | } |
| 444 | |
| 445 | static void __page_pool_dma_sync_for_device(const struct page_pool *pool, |
| 446 | netmem_ref netmem, |
| 447 | u32 dma_sync_size) |
| 448 | { |
| 449 | #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) |
| 450 | dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); |
| 451 | |
| 452 | dma_sync_size = min(dma_sync_size, pool->p.max_len); |
| 453 | __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, |
| 454 | dma_sync_size, pool->p.dma_dir); |
| 455 | #endif |
| 456 | } |
| 457 | |
| 458 | static __always_inline void |
| 459 | page_pool_dma_sync_for_device(const struct page_pool *pool, |
| 460 | netmem_ref netmem, |
| 461 | u32 dma_sync_size) |
| 462 | { |
| 463 | if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { |
| 464 | rcu_read_lock(); |
| 465 | /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ |
| 466 | if (pool->dma_sync) |
| 467 | __page_pool_dma_sync_for_device(pool, netmem, |
| 468 | dma_sync_size); |
| 469 | rcu_read_unlock(); |
| 470 | } |
| 471 | } |
| 472 | |
| 473 | static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) |
| 474 | { |
| 475 | dma_addr_t dma; |
| 476 | int err; |
| 477 | u32 id; |
| 478 | |
	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (e.g. a 32bit cpu with 64bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
| 484 | dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, |
| 485 | (PAGE_SIZE << pool->p.order), pool->p.dma_dir, |
| 486 | DMA_ATTR_SKIP_CPU_SYNC | |
| 487 | DMA_ATTR_WEAK_ORDERING); |
| 488 | if (dma_mapping_error(pool->p.dev, dma)) |
| 489 | return false; |
| 490 | |
| 491 | if (page_pool_set_dma_addr_netmem(netmem, dma)) { |
| 492 | WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); |
| 493 | goto unmap_failed; |
| 494 | } |
| 495 | |
| 496 | if (in_softirq()) |
| 497 | err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), |
| 498 | PP_DMA_INDEX_LIMIT, gfp); |
| 499 | else |
| 500 | err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), |
| 501 | PP_DMA_INDEX_LIMIT, gfp); |
| 502 | if (err) { |
| 503 | WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); |
| 504 | goto unset_failed; |
| 505 | } |
| 506 | |
| 507 | netmem_set_dma_index(netmem, id); |
| 508 | page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); |
| 509 | |
| 510 | return true; |
| 511 | |
| 512 | unset_failed: |
| 513 | page_pool_set_dma_addr_netmem(netmem, 0); |
| 514 | unmap_failed: |
| 515 | dma_unmap_page_attrs(pool->p.dev, dma, |
| 516 | PAGE_SIZE << pool->p.order, pool->p.dma_dir, |
| 517 | DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); |
| 518 | return false; |
| 519 | } |
| 520 | |
| 521 | static struct page *__page_pool_alloc_page_order(struct page_pool *pool, |
| 522 | gfp_t gfp) |
| 523 | { |
| 524 | struct page *page; |
| 525 | |
| 526 | gfp |= __GFP_COMP; |
| 527 | page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); |
| 528 | if (unlikely(!page)) |
| 529 | return NULL; |
| 530 | |
| 531 | if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { |
| 532 | put_page(page); |
| 533 | return NULL; |
| 534 | } |
| 535 | |
| 536 | alloc_stat_inc(pool, slow_high_order); |
| 537 | page_pool_set_pp_info(pool, page_to_netmem(page)); |
| 538 | |
| 539 | /* Track how many pages are held 'in-flight' */ |
| 540 | pool->pages_state_hold_cnt++; |
| 541 | trace_page_pool_state_hold(pool, page_to_netmem(page), |
| 542 | pool->pages_state_hold_cnt); |
| 543 | return page; |
| 544 | } |
| 545 | |
| 546 | /* slow path */ |
| 547 | static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, |
| 548 | gfp_t gfp) |
| 549 | { |
| 550 | const int bulk = PP_ALLOC_CACHE_REFILL; |
| 551 | unsigned int pp_order = pool->p.order; |
| 552 | bool dma_map = pool->dma_map; |
| 553 | netmem_ref netmem; |
| 554 | int i, nr_pages; |
| 555 | |
| 556 | /* Don't support bulk alloc for high-order pages */ |
| 557 | if (unlikely(pp_order)) |
| 558 | return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); |
| 559 | |
| 560 | /* Unnecessary as alloc cache is empty, but guarantees zero count */ |
| 561 | if (unlikely(pool->alloc.count > 0)) |
| 562 | return pool->alloc.cache[--pool->alloc.count]; |
| 563 | |
| 564 | /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ |
| 565 | memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); |
| 566 | |
| 567 | nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, |
| 568 | (struct page **)pool->alloc.cache); |
| 569 | if (unlikely(!nr_pages)) |
| 570 | return 0; |
| 571 | |
	/* Pages have been filled into the alloc.cache array, but the count is
	 * zero and the page elements have not (yet) been DMA mapped.
	 */
| 575 | for (i = 0; i < nr_pages; i++) { |
| 576 | netmem = pool->alloc.cache[i]; |
| 577 | if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { |
| 578 | put_page(netmem_to_page(netmem)); |
| 579 | continue; |
| 580 | } |
| 581 | |
| 582 | page_pool_set_pp_info(pool, netmem); |
| 583 | pool->alloc.cache[pool->alloc.count++] = netmem; |
| 584 | /* Track how many pages are held 'in-flight' */ |
| 585 | pool->pages_state_hold_cnt++; |
| 586 | trace_page_pool_state_hold(pool, netmem, |
| 587 | pool->pages_state_hold_cnt); |
| 588 | } |
| 589 | |
| 590 | /* Return last page */ |
| 591 | if (likely(pool->alloc.count > 0)) { |
| 592 | netmem = pool->alloc.cache[--pool->alloc.count]; |
| 593 | alloc_stat_inc(pool, slow); |
| 594 | } else { |
| 595 | netmem = 0; |
| 596 | } |
| 597 | |
	/* A page that was just allocated should/must have refcnt 1. */
| 599 | return netmem; |
| 600 | } |
| 601 | |
/* page_pool is meant to replace alloc_pages() API calls, but the caller must
 * provide the synchronization guarantee on the allocation side (e.g. by only
 * allocating from a single NAPI/softirq context per pool).
 */
| 605 | netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) |
| 606 | { |
| 607 | netmem_ref netmem; |
| 608 | |
| 609 | /* Fast-path: Get a page from cache */ |
| 610 | netmem = __page_pool_get_cached(pool); |
| 611 | if (netmem) |
| 612 | return netmem; |
| 613 | |
| 614 | /* Slow-path: cache empty, do real allocation */ |
| 615 | if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) |
| 616 | netmem = pool->mp_ops->alloc_netmems(pool, gfp); |
| 617 | else |
| 618 | netmem = __page_pool_alloc_pages_slow(pool, gfp); |
| 619 | return netmem; |
| 620 | } |
| 621 | EXPORT_SYMBOL(page_pool_alloc_netmems); |
| 622 | ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); |
| 623 | |
| 624 | struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) |
| 625 | { |
| 626 | return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); |
| 627 | } |
| 628 | EXPORT_SYMBOL(page_pool_alloc_pages); |
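
/* Illustrative sketch (not part of this file): the allocation side is meant
 * to be driven from a single NAPI/softirq context per pool, e.g. an RX ring
 * refill loop. The rxq, rx_offset and mydrv_* names are hypothetical.
 *
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_pages(pool, GFP_ATOMIC);
 *	if (!page)
 *		break;
 *	dma = page_pool_get_dma_addr(page) + rx_offset;
 *	mydrv_post_rx_buffer(rxq, dma);
 *
 * On the release side (e.g. XDP_DROP in the same NAPI poll), the page goes
 * back with page_pool_put_full_page(pool, page, true), which lets the page
 * be recycled directly into the alloc-side cache.
 */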
| 629 | |
| 630 | /* Calculate distance between two u32 values, valid if distance is below 2^(31) |
| 631 | * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution |
| 632 | */ |
| 633 | #define _distance(a, b) (s32)((a) - (b)) |
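
/* Worked example of the wrap-safe distance: with hold_cnt == 3 (the hold
 * counter has already wrapped past zero) and release_cnt == 0xfffffffe,
 * (s32)(3 - 0xfffffffe) == 5, so five pages are still correctly reported as
 * in-flight despite the wraparound.
 */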
| 634 | |
| 635 | s32 page_pool_inflight(const struct page_pool *pool, bool strict) |
| 636 | { |
| 637 | u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); |
| 638 | u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); |
| 639 | s32 inflight; |
| 640 | |
| 641 | inflight = _distance(hold_cnt, release_cnt); |
| 642 | |
| 643 | if (strict) { |
| 644 | trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); |
| 645 | WARN(inflight < 0, "Negative(%d) inflight packet-pages", |
| 646 | inflight); |
| 647 | } else { |
| 648 | inflight = max(0, inflight); |
| 649 | } |
| 650 | |
| 651 | return inflight; |
| 652 | } |
| 653 | |
| 654 | void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) |
| 655 | { |
| 656 | netmem_set_pp(netmem, pool); |
| 657 | netmem_or_pp_magic(netmem, PP_SIGNATURE); |
| 658 | |
| 659 | /* Ensuring all pages have been split into one fragment initially: |
| 660 | * page_pool_set_pp_info() is only called once for every page when it |
| 661 | * is allocated from the page allocator and page_pool_fragment_page() |
| 662 | * is dirtying the same cache line as the page->pp_magic above, so |
| 663 | * the overhead is negligible. |
| 664 | */ |
| 665 | page_pool_fragment_netmem(netmem, 1); |
| 666 | if (pool->has_init_callback) |
| 667 | pool->slow.init_callback(netmem, pool->slow.init_arg); |
| 668 | } |
| 669 | |
| 670 | void page_pool_clear_pp_info(netmem_ref netmem) |
| 671 | { |
| 672 | netmem_clear_pp_magic(netmem); |
| 673 | netmem_set_pp(netmem, NULL); |
| 674 | } |
| 675 | |
| 676 | static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, |
| 677 | netmem_ref netmem) |
| 678 | { |
| 679 | struct page *old, *page = netmem_to_page(netmem); |
| 680 | unsigned long id; |
| 681 | dma_addr_t dma; |
| 682 | |
| 683 | if (!pool->dma_map) |
| 684 | /* Always account for inflight pages, even if we didn't |
| 685 | * map them |
| 686 | */ |
| 687 | return; |
| 688 | |
| 689 | id = netmem_get_dma_index(netmem); |
| 690 | if (!id) |
| 691 | return; |
| 692 | |
| 693 | if (in_softirq()) |
| 694 | old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); |
| 695 | else |
| 696 | old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); |
| 697 | if (old != page) |
| 698 | return; |
| 699 | |
| 700 | dma = page_pool_get_dma_addr_netmem(netmem); |
| 701 | |
| 702 | /* When page is unmapped, it cannot be returned to our pool */ |
| 703 | dma_unmap_page_attrs(pool->p.dev, dma, |
| 704 | PAGE_SIZE << pool->p.order, pool->p.dma_dir, |
| 705 | DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); |
| 706 | page_pool_set_dma_addr_netmem(netmem, 0); |
| 707 | netmem_set_dma_index(netmem, 0); |
| 708 | } |
| 709 | |
/* Disconnects a page (netmem) from a page_pool. API users may need to
 * disconnect a page from its page_pool, to allow it to be used as a regular
 * page that will eventually be returned to the normal page-allocator via
 * put_page().
 */
| 715 | void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) |
| 716 | { |
| 717 | int count; |
| 718 | bool put; |
| 719 | |
| 720 | put = true; |
| 721 | if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) |
| 722 | put = pool->mp_ops->release_netmem(pool, netmem); |
| 723 | else |
| 724 | __page_pool_release_page_dma(pool, netmem); |
| 725 | |
| 726 | /* This may be the last page returned, releasing the pool, so |
| 727 | * it is not safe to reference pool afterwards. |
| 728 | */ |
| 729 | count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); |
| 730 | trace_page_pool_state_release(pool, netmem, count); |
| 731 | |
| 732 | if (put) { |
| 733 | page_pool_clear_pp_info(netmem); |
| 734 | put_page(netmem_to_page(netmem)); |
| 735 | } |
| 736 | /* An optimization would be to call __free_pages(page, pool->p.order) |
| 737 | * knowing page is not part of page-cache (thus avoiding a |
| 738 | * __page_cache_release() call). |
| 739 | */ |
| 740 | } |
| 741 | |
| 742 | static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) |
| 743 | { |
| 744 | bool in_softirq, ret; |
| 745 | |
| 746 | /* BH protection not needed if current is softirq */ |
| 747 | in_softirq = page_pool_producer_lock(pool); |
| 748 | ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); |
| 749 | if (ret) |
| 750 | recycle_stat_inc(pool, ring); |
| 751 | page_pool_producer_unlock(pool, in_softirq); |
| 752 | |
| 753 | return ret; |
| 754 | } |
| 755 | |
| 756 | /* Only allow direct recycling in special circumstances, into the |
| 757 | * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. |
| 758 | * |
| 759 | * Caller must provide appropriate safe context. |
| 760 | */ |
| 761 | static bool page_pool_recycle_in_cache(netmem_ref netmem, |
| 762 | struct page_pool *pool) |
| 763 | { |
| 764 | if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { |
| 765 | recycle_stat_inc(pool, cache_full); |
| 766 | return false; |
| 767 | } |
| 768 | |
| 769 | /* Caller MUST have verified/know (page_ref_count(page) == 1) */ |
| 770 | pool->alloc.cache[pool->alloc.count++] = netmem; |
| 771 | recycle_stat_inc(pool, cached); |
| 772 | return true; |
| 773 | } |
| 774 | |
| 775 | static bool __page_pool_page_can_be_recycled(netmem_ref netmem) |
| 776 | { |
| 777 | return netmem_is_net_iov(netmem) || |
| 778 | (page_ref_count(netmem_to_page(netmem)) == 1 && |
| 779 | !page_is_pfmemalloc(netmem_to_page(netmem))); |
| 780 | } |
| 781 | |
/* If the page refcnt == 1, this will try to recycle the page.
 * If pool->dma_sync is set, we'll try to sync the DMA area for
 * the configured size min(dma_sync_size, pool->p.max_len).
 * If the page refcnt != 1, then the page will be returned to the memory
 * subsystem.
 */
| 788 | static __always_inline netmem_ref |
| 789 | __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, |
| 790 | unsigned int dma_sync_size, bool allow_direct) |
| 791 | { |
| 792 | lockdep_assert_no_hardirq(); |
| 793 | |
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns the page, and can recycle it.
	 *
	 * The page is NOT reusable when it was allocated while the system
	 * was under memory pressure (page_is_pfmemalloc()).
	 */
| 803 | if (likely(__page_pool_page_can_be_recycled(netmem))) { |
| 804 | /* Read barrier done in page_ref_count / READ_ONCE */ |
| 805 | |
| 806 | page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); |
| 807 | |
| 808 | if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) |
| 809 | return 0; |
| 810 | |
| 811 | /* Page found as candidate for recycling */ |
| 812 | return netmem; |
| 813 | } |
| 814 | |
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case the page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page().
	 */
| 828 | recycle_stat_inc(pool, released_refcnt); |
| 829 | page_pool_return_page(pool, netmem); |
| 830 | |
| 831 | return 0; |
| 832 | } |
| 833 | |
| 834 | static bool page_pool_napi_local(const struct page_pool *pool) |
| 835 | { |
| 836 | const struct napi_struct *napi; |
| 837 | u32 cpuid; |
| 838 | |
| 839 | /* On PREEMPT_RT the softirq can be preempted by the consumer */ |
| 840 | if (IS_ENABLED(CONFIG_PREEMPT_RT)) |
| 841 | return false; |
| 842 | |
| 843 | if (unlikely(!in_softirq())) |
| 844 | return false; |
| 845 | |
	/* Allow direct recycle if we have reasons to believe that we are
	 * in the same context in which the consumer would run, so there's
	 * no possible race.
| 849 | * __page_pool_put_page() makes sure we're not in hardirq context |
| 850 | * and interrupts are enabled prior to accessing the cache. |
| 851 | */ |
| 852 | cpuid = smp_processor_id(); |
| 853 | if (READ_ONCE(pool->cpuid) == cpuid) |
| 854 | return true; |
| 855 | |
| 856 | napi = READ_ONCE(pool->p.napi); |
| 857 | |
| 858 | return napi && READ_ONCE(napi->list_owner) == cpuid; |
| 859 | } |
| 860 | |
| 861 | void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, |
| 862 | unsigned int dma_sync_size, bool allow_direct) |
| 863 | { |
| 864 | if (!allow_direct) |
| 865 | allow_direct = page_pool_napi_local(pool); |
| 866 | |
| 867 | netmem = __page_pool_put_page(pool, netmem, dma_sync_size, |
| 868 | allow_direct); |
| 869 | if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { |
		/* Ring full, fall back to freeing the page */
| 871 | recycle_stat_inc(pool, ring_full); |
| 872 | page_pool_return_page(pool, netmem); |
| 873 | } |
| 874 | } |
| 875 | EXPORT_SYMBOL(page_pool_put_unrefed_netmem); |
| 876 | |
| 877 | void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, |
| 878 | unsigned int dma_sync_size, bool allow_direct) |
| 879 | { |
| 880 | page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, |
| 881 | allow_direct); |
| 882 | } |
| 883 | EXPORT_SYMBOL(page_pool_put_unrefed_page); |
| 884 | |
| 885 | static void page_pool_recycle_ring_bulk(struct page_pool *pool, |
| 886 | netmem_ref *bulk, |
| 887 | u32 bulk_len) |
| 888 | { |
| 889 | bool in_softirq; |
| 890 | u32 i; |
| 891 | |
| 892 | /* Bulk produce into ptr_ring page_pool cache */ |
| 893 | in_softirq = page_pool_producer_lock(pool); |
| 894 | |
| 895 | for (i = 0; i < bulk_len; i++) { |
| 896 | if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { |
| 897 | /* ring full */ |
| 898 | recycle_stat_inc(pool, ring_full); |
| 899 | break; |
| 900 | } |
| 901 | } |
| 902 | |
| 903 | page_pool_producer_unlock(pool, in_softirq); |
| 904 | recycle_stat_add(pool, ring, i); |
| 905 | |
| 906 | /* Hopefully all pages were returned into ptr_ring */ |
| 907 | if (likely(i == bulk_len)) |
| 908 | return; |
| 909 | |
| 910 | /* |
| 911 | * ptr_ring cache is full, free remaining pages outside producer lock |
| 912 | * since put_page() with refcnt == 1 can be an expensive operation. |
| 913 | */ |
| 914 | for (; i < bulk_len; i++) |
| 915 | page_pool_return_page(pool, bulk[i]); |
| 916 | } |
| 917 | |
| 918 | /** |
| 919 | * page_pool_put_netmem_bulk() - release references on multiple netmems |
| 920 | * @data: array holding netmem references |
| 921 | * @count: number of entries in @data |
| 922 | * |
 * Tries to refill a number of netmems into the ptr_ring cache while holding
 * the ptr_ring producer lock. If the ptr_ring is full,
 * page_pool_put_netmem_bulk() will release the leftover netmems to the
 * memory provider.
 * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI TX
 * completion loop for the XDP_REDIRECT use case.
 *
 * Please note the caller must not use the @data area after running
 * page_pool_put_netmem_bulk(), as this function overwrites it.
| 931 | */ |
| 932 | void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) |
| 933 | { |
| 934 | u32 bulk_len = 0; |
| 935 | |
| 936 | for (u32 i = 0; i < count; i++) { |
| 937 | netmem_ref netmem = netmem_compound_head(data[i]); |
| 938 | |
| 939 | if (page_pool_unref_and_test(netmem)) |
| 940 | data[bulk_len++] = netmem; |
| 941 | } |
| 942 | |
| 943 | count = bulk_len; |
| 944 | while (count) { |
| 945 | netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; |
| 946 | struct page_pool *pool = NULL; |
| 947 | bool allow_direct; |
| 948 | u32 foreign = 0; |
| 949 | |
| 950 | bulk_len = 0; |
| 951 | |
| 952 | for (u32 i = 0; i < count; i++) { |
| 953 | struct page_pool *netmem_pp; |
| 954 | netmem_ref netmem = data[i]; |
| 955 | |
| 956 | netmem_pp = netmem_get_pp(netmem); |
| 957 | if (unlikely(!pool)) { |
| 958 | pool = netmem_pp; |
| 959 | allow_direct = page_pool_napi_local(pool); |
| 960 | } else if (netmem_pp != pool) { |
| 961 | /* |
| 962 | * If the netmem belongs to a different |
| 963 | * page_pool, save it for another round. |
| 964 | */ |
| 965 | data[foreign++] = netmem; |
| 966 | continue; |
| 967 | } |
| 968 | |
| 969 | netmem = __page_pool_put_page(pool, netmem, -1, |
| 970 | allow_direct); |
| 971 | /* Approved for bulk recycling in ptr_ring cache */ |
| 972 | if (netmem) |
| 973 | bulk[bulk_len++] = netmem; |
| 974 | } |
| 975 | |
| 976 | if (bulk_len) |
| 977 | page_pool_recycle_ring_bulk(pool, bulk, bulk_len); |
| 978 | |
| 979 | count = foreign; |
| 980 | } |
| 981 | } |
| 982 | EXPORT_SYMBOL(page_pool_put_netmem_bulk); |
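
/* Illustrative sketch (not part of this file): batching returns from a driver
 * TX completion loop. Most drivers reach this via the xdp_frame_bulk helpers
 * in net/core/xdp.c, but a direct use could look like the below; the
 * mydrv_* names are hypothetical.
 *
 *	netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
 *	u32 n = 0;
 *
 *	while (n < XDP_BULK_QUEUE_SIZE && mydrv_tx_completion_pending(txq))
 *		bulk[n++] = mydrv_tx_completion_pop_netmem(txq);
 *
 *	page_pool_put_netmem_bulk(bulk, n);
 *
 * The netmems may belong to different page_pools; the loop in
 * page_pool_put_netmem_bulk() sorts them out one pool at a time.
 */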
| 983 | |
| 984 | static netmem_ref page_pool_drain_frag(struct page_pool *pool, |
| 985 | netmem_ref netmem) |
| 986 | { |
| 987 | long drain_count = BIAS_MAX - pool->frag_users; |
| 988 | |
| 989 | /* Some user is still using the page frag */ |
| 990 | if (likely(page_pool_unref_netmem(netmem, drain_count))) |
| 991 | return 0; |
| 992 | |
| 993 | if (__page_pool_page_can_be_recycled(netmem)) { |
| 994 | page_pool_dma_sync_for_device(pool, netmem, -1); |
| 995 | return netmem; |
| 996 | } |
| 997 | |
| 998 | page_pool_return_page(pool, netmem); |
| 999 | return 0; |
| 1000 | } |
| 1001 | |
| 1002 | static void page_pool_free_frag(struct page_pool *pool) |
| 1003 | { |
| 1004 | long drain_count = BIAS_MAX - pool->frag_users; |
| 1005 | netmem_ref netmem = pool->frag_page; |
| 1006 | |
| 1007 | pool->frag_page = 0; |
| 1008 | |
| 1009 | if (!netmem || page_pool_unref_netmem(netmem, drain_count)) |
| 1010 | return; |
| 1011 | |
| 1012 | page_pool_return_page(pool, netmem); |
| 1013 | } |
| 1014 | |
| 1015 | netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, |
| 1016 | unsigned int *offset, unsigned int size, |
| 1017 | gfp_t gfp) |
| 1018 | { |
| 1019 | unsigned int max_size = PAGE_SIZE << pool->p.order; |
| 1020 | netmem_ref netmem = pool->frag_page; |
| 1021 | |
| 1022 | if (WARN_ON(size > max_size)) |
| 1023 | return 0; |
| 1024 | |
| 1025 | size = ALIGN(size, dma_get_cache_alignment()); |
| 1026 | *offset = pool->frag_offset; |
| 1027 | |
| 1028 | if (netmem && *offset + size > max_size) { |
| 1029 | netmem = page_pool_drain_frag(pool, netmem); |
| 1030 | if (netmem) { |
| 1031 | recycle_stat_inc(pool, cached); |
| 1032 | alloc_stat_inc(pool, fast); |
| 1033 | goto frag_reset; |
| 1034 | } |
| 1035 | } |
| 1036 | |
| 1037 | if (!netmem) { |
| 1038 | netmem = page_pool_alloc_netmems(pool, gfp); |
| 1039 | if (unlikely(!netmem)) { |
| 1040 | pool->frag_page = 0; |
| 1041 | return 0; |
| 1042 | } |
| 1043 | |
| 1044 | pool->frag_page = netmem; |
| 1045 | |
| 1046 | frag_reset: |
| 1047 | pool->frag_users = 1; |
| 1048 | *offset = 0; |
| 1049 | pool->frag_offset = size; |
| 1050 | page_pool_fragment_netmem(netmem, BIAS_MAX); |
| 1051 | return netmem; |
| 1052 | } |
| 1053 | |
| 1054 | pool->frag_users++; |
| 1055 | pool->frag_offset = *offset + size; |
| 1056 | return netmem; |
| 1057 | } |
| 1058 | EXPORT_SYMBOL(page_pool_alloc_frag_netmem); |
| 1059 | |
| 1060 | struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, |
| 1061 | unsigned int size, gfp_t gfp) |
| 1062 | { |
| 1063 | return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, |
| 1064 | gfp)); |
| 1065 | } |
| 1066 | EXPORT_SYMBOL(page_pool_alloc_frag); |
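
/* Illustrative sketch (not part of this file): the frag API lets a driver
 * carve several RX buffers out of one (possibly high-order) pool page. The
 * 2048-byte size and the rxq and mydrv_* names are hypothetical.
 *
 *	unsigned int offset;
 *	struct page *page;
 *	dma_addr_t dma;
 *
 *	page = page_pool_alloc_frag(pool, &offset, 2048, GFP_ATOMIC);
 *	if (!page)
 *		return -ENOMEM;
 *	dma = page_pool_get_dma_addr(page) + offset;
 *	mydrv_post_rx_buffer(rxq, dma);
 *
 * Each fragment is released individually (e.g. via page_pool_put_full_page());
 * the underlying page is only recycled or freed once all fragments have been
 * returned.
 */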
| 1067 | |
| 1068 | static void page_pool_empty_ring(struct page_pool *pool) |
| 1069 | { |
| 1070 | netmem_ref netmem; |
| 1071 | |
| 1072 | /* Empty recycle ring */ |
| 1073 | while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { |
| 1074 | /* Verify the refcnt invariant of cached pages */ |
| 1075 | if (!(netmem_ref_count(netmem) == 1)) |
| 1076 | pr_crit("%s() page_pool refcnt %d violation\n", |
| 1077 | __func__, netmem_ref_count(netmem)); |
| 1078 | |
| 1079 | page_pool_return_page(pool, netmem); |
| 1080 | } |
| 1081 | } |
| 1082 | |
| 1083 | static void __page_pool_destroy(struct page_pool *pool) |
| 1084 | { |
| 1085 | if (pool->disconnect) |
| 1086 | pool->disconnect(pool); |
| 1087 | |
| 1088 | page_pool_unlist(pool); |
| 1089 | page_pool_uninit(pool); |
| 1090 | |
| 1091 | if (pool->mp_ops) { |
| 1092 | pool->mp_ops->destroy(pool); |
| 1093 | static_branch_dec(&page_pool_mem_providers); |
| 1094 | } |
| 1095 | |
| 1096 | kfree(pool); |
| 1097 | } |
| 1098 | |
| 1099 | static void page_pool_empty_alloc_cache_once(struct page_pool *pool) |
| 1100 | { |
| 1101 | netmem_ref netmem; |
| 1102 | |
| 1103 | if (pool->destroy_cnt) |
| 1104 | return; |
| 1105 | |
	/* Empty the alloc cache; assume the caller has made sure it is
	 * no longer in use, and that page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
| 1110 | while (pool->alloc.count) { |
| 1111 | netmem = pool->alloc.cache[--pool->alloc.count]; |
| 1112 | page_pool_return_page(pool, netmem); |
| 1113 | } |
| 1114 | } |
| 1115 | |
| 1116 | static void page_pool_scrub(struct page_pool *pool) |
| 1117 | { |
| 1118 | unsigned long id; |
| 1119 | void *ptr; |
| 1120 | |
| 1121 | page_pool_empty_alloc_cache_once(pool); |
| 1122 | if (!pool->destroy_cnt++ && pool->dma_map) { |
| 1123 | if (pool->dma_sync) { |
| 1124 | /* Disable page_pool_dma_sync_for_device() */ |
| 1125 | pool->dma_sync = false; |
| 1126 | |
| 1127 | /* Make sure all concurrent returns that may see the old |
| 1128 | * value of dma_sync (and thus perform a sync) have |
| 1129 | * finished before doing the unmapping below. Skip the |
| 1130 | * wait if the device doesn't actually need syncing, or |
| 1131 | * if there are no outstanding mapped pages. |
| 1132 | */ |
| 1133 | if (dma_dev_need_sync(pool->p.dev) && |
| 1134 | !xa_empty(&pool->dma_mapped)) |
| 1135 | synchronize_net(); |
| 1136 | } |
| 1137 | |
| 1138 | xa_for_each(&pool->dma_mapped, id, ptr) |
| 1139 | __page_pool_release_page_dma(pool, page_to_netmem(ptr)); |
| 1140 | } |
| 1141 | |
| 1142 | /* No more consumers should exist, but producers could still |
| 1143 | * be in-flight. |
| 1144 | */ |
| 1145 | page_pool_empty_ring(pool); |
| 1146 | } |
| 1147 | |
| 1148 | static int page_pool_release(struct page_pool *pool) |
| 1149 | { |
| 1150 | bool in_softirq; |
| 1151 | int inflight; |
| 1152 | |
| 1153 | page_pool_scrub(pool); |
| 1154 | inflight = page_pool_inflight(pool, true); |
| 1155 | /* Acquire producer lock to make sure producers have exited. */ |
| 1156 | in_softirq = page_pool_producer_lock(pool); |
| 1157 | page_pool_producer_unlock(pool, in_softirq); |
| 1158 | if (!inflight) |
| 1159 | __page_pool_destroy(pool); |
| 1160 | |
| 1161 | return inflight; |
| 1162 | } |
| 1163 | |
| 1164 | static void page_pool_release_retry(struct work_struct *wq) |
| 1165 | { |
| 1166 | struct delayed_work *dwq = to_delayed_work(wq); |
| 1167 | struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); |
| 1168 | void *netdev; |
| 1169 | int inflight; |
| 1170 | |
| 1171 | inflight = page_pool_release(pool); |
	/* In rare cases, a driver bug may cause inflight to go negative.
	 * Don't reschedule release if inflight is 0 or negative.
	 * - If 0, the page_pool has been destroyed.
	 * - If negative, we will never recover.
	 * In both cases no reschedule is necessary.
	 */
| 1178 | if (inflight <= 0) |
| 1179 | return; |
| 1180 | |
| 1181 | /* Periodic warning for page pools the user can't see */ |
| 1182 | netdev = READ_ONCE(pool->slow.netdev); |
| 1183 | if (time_after_eq(jiffies, pool->defer_warn) && |
| 1184 | (!netdev || netdev == NET_PTR_POISON)) { |
| 1185 | int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; |
| 1186 | |
| 1187 | pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", |
| 1188 | __func__, pool->user.id, inflight, sec); |
| 1189 | pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; |
| 1190 | } |
| 1191 | |
| 1192 | /* Still not ready to be disconnected, retry later */ |
| 1193 | schedule_delayed_work(&pool->release_dw, DEFER_TIME); |
| 1194 | } |
| 1195 | |
| 1196 | void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), |
| 1197 | const struct xdp_mem_info *mem) |
| 1198 | { |
| 1199 | refcount_inc(&pool->user_cnt); |
| 1200 | pool->disconnect = disconnect; |
| 1201 | pool->xdp_mem_id = mem->id; |
| 1202 | } |
| 1203 | |
| 1204 | void page_pool_disable_direct_recycling(struct page_pool *pool) |
| 1205 | { |
| 1206 | /* Disable direct recycling based on pool->cpuid. |
| 1207 | * Paired with READ_ONCE() in page_pool_napi_local(). |
| 1208 | */ |
| 1209 | WRITE_ONCE(pool->cpuid, -1); |
| 1210 | |
| 1211 | if (!pool->p.napi) |
| 1212 | return; |
| 1213 | |
| 1214 | napi_assert_will_not_race(pool->p.napi); |
| 1215 | |
| 1216 | mutex_lock(&page_pools_lock); |
| 1217 | WRITE_ONCE(pool->p.napi, NULL); |
| 1218 | mutex_unlock(&page_pools_lock); |
| 1219 | } |
| 1220 | EXPORT_SYMBOL(page_pool_disable_direct_recycling); |
| 1221 | |
| 1222 | void page_pool_destroy(struct page_pool *pool) |
| 1223 | { |
| 1224 | if (!pool) |
| 1225 | return; |
| 1226 | |
| 1227 | if (!page_pool_put(pool)) |
| 1228 | return; |
| 1229 | |
| 1230 | page_pool_disable_direct_recycling(pool); |
| 1231 | page_pool_free_frag(pool); |
| 1232 | |
| 1233 | if (!page_pool_release(pool)) |
| 1234 | return; |
| 1235 | |
| 1236 | page_pool_detached(pool); |
| 1237 | pool->defer_start = jiffies; |
| 1238 | pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; |
| 1239 | |
| 1240 | INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); |
| 1241 | schedule_delayed_work(&pool->release_dw, DEFER_TIME); |
| 1242 | } |
| 1243 | EXPORT_SYMBOL(page_pool_destroy); |
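
/* Illustrative sketch (not part of this file): driver-side teardown order;
 * the mydrv_* and rxq names are hypothetical.
 *
 *	mydrv_stop_rx(rxq);
 *	page_pool_destroy(rxq->page_pool);
 *
 * page_pool_destroy() is safe to call while pages are still in flight (e.g.
 * sitting in SKBs queued on sockets): the pool is then torn down later from
 * page_pool_release_retry() once the last in-flight page is returned.
 */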
| 1244 | |
| 1245 | /* Caller must provide appropriate safe context, e.g. NAPI. */ |
| 1246 | void page_pool_update_nid(struct page_pool *pool, int new_nid) |
| 1247 | { |
| 1248 | netmem_ref netmem; |
| 1249 | |
| 1250 | trace_page_pool_update_nid(pool, new_nid); |
| 1251 | pool->p.nid = new_nid; |
| 1252 | |
| 1253 | /* Flush pool alloc cache, as refill will check NUMA node */ |
| 1254 | while (pool->alloc.count) { |
| 1255 | netmem = pool->alloc.cache[--pool->alloc.count]; |
| 1256 | page_pool_return_page(pool, netmem); |
| 1257 | } |
| 1258 | } |
| 1259 | EXPORT_SYMBOL(page_pool_update_nid); |
| 1260 | |
| 1261 | bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) |
| 1262 | { |
| 1263 | return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); |
| 1264 | } |
| 1265 | |
/* Associate a niov with a page pool. Should be followed by a matching
 * net_mp_niov_clear_page_pool().
 */
| 1269 | void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) |
| 1270 | { |
| 1271 | netmem_ref netmem = net_iov_to_netmem(niov); |
| 1272 | |
| 1273 | page_pool_set_pp_info(pool, netmem); |
| 1274 | |
| 1275 | pool->pages_state_hold_cnt++; |
| 1276 | trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); |
| 1277 | } |
| 1278 | |
| 1279 | /* Disassociate a niov from a page pool. Should only be used in the |
| 1280 | * ->release_netmem() path. |
| 1281 | */ |
| 1282 | void net_mp_niov_clear_page_pool(struct net_iov *niov) |
| 1283 | { |
| 1284 | netmem_ref netmem = net_iov_to_netmem(niov); |
| 1285 | |
| 1286 | page_pool_clear_pp_info(netmem); |
| 1287 | } |