/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

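/* Deferred-release cadence: retry every DEFER_TIME, and while pages
 * are still in-flight warn at most once per DEFER_WARN_INTERVAL
 * (see page_pool_release_retry() below).
 */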
#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows pages to also be used for DMA transmit,
	 * which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
		/* In order to request DMA-sync-for-device the page
		 * needs to be mapped
		 */
		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
			return -EINVAL;

		if (!pool->p.max_len)
			return -EINVAL;

		/* pool->p.offset has to be set according to the address
		 * offset used by the DMA engine to start copying rx data
		 */
	}

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* Driver calling page_pool_create() also calls page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);
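
/* Illustrative usage sketch (not part of this file): a driver would
 * typically fill in struct page_pool_params and check for IS_ERR().
 * The "priv" pointer below is a stand-in for the driver's own state.
 *
 *	struct page_pool_params pp_params = {
 *		.order		= 0,
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= priv->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 */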

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	bool refill = false;
	struct page *page;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		refill = true;
	}

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Slow-path: Get page from locked ring queue,
	 * refill alloc array if requested.
	 */
	spin_lock(&r->consumer_lock);
	page = __ptr_ring_consume(r);
	if (refill)
		pool->alloc.count = __ptr_ring_consume_batched(r,
							pool->alloc.cache,
							PP_ALLOC_CACHE_REFILL);
	spin_unlock(&r->consumer_lock);
	return page;
}

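/* When PP_FLAG_DMA_SYNC_DEV is set, hand ownership of the part of
 * the page that packet data can occupy back to the device, i.e. from
 * pool->p.offset up to at most pool->p.max_len bytes.
 */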
static void page_pool_dma_sync_for_device(struct page_pool *pool,
					  struct page *page,
					  unsigned int dma_sync_size)
{
	dma_sync_size = min(dma_sync_size, pool->p.max_len);
	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
					 pool->p.offset, dma_sync_size,
					 pool->p.dma_dir);
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * The current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance.  This code
	 * needs bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
	 * since dma_addr_t can be either 32 or 64 bits and does not always fit
	 * into page private data (i.e. a 32-bit CPU with 64-bit DMA caps).
	 * This mapping is kept for the lifetime of the page, until it
	 * leaves the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* A page that was just allocated should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, while
 * providing a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
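
/* Illustrative sketch (not part of this file): RX refill typically
 * runs in NAPI/softirq context, so a caller would pass atomic
 * allocation flags, e.g.:
 *
 *	page = page_pool_alloc_pages(pool, GFP_ATOMIC | __GFP_NOWARN);
 *	if (unlikely(!page))
 *		goto out_of_mem;	(hypothetical driver-specific label)
 */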

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))
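
/* Worked example: if hold_cnt has wrapped past zero while release_cnt
 * has not, e.g. hold_cnt == 2 and release_cnt == 0xfffffffe, then
 * _distance(2, 0xfffffffe) == (s32)0x00000004 == 4 pages in-flight,
 * despite the u32 wrap-around.
 */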

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_unmap;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
skip_dma_unmap:
	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}

/* unmap the page and clean our state */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
	/* When a page is unmapped, this implies that it will not be
	 * returned to the page_pool.
	 */
	__page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

/* page is NOT reusable when:
 * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
 * 2) belongs to a different NUMA node than pool->p.nid.
 *
 * To update pool->p.nid users must call page_pool_update_nid.
 */
static bool pool_page_reusable(struct page_pool *pool, struct page *page)
{
	return !page_is_pfmemalloc(page) && page_to_nid(page) == pool->p.nid;
}

void __page_pool_put_page(struct page_pool *pool, struct page *page,
			  unsigned int dma_sync_size, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1 &&
		   pool_page_reusable(pool, page))) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
			page_pool_dma_sync_for_device(pool, page,
						      dma_sync_size);

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once.  In this elevated refcnt
	 * case, the DMA mapping is unmapped/released here, as the driver
	 * is likely doing refcnt based recycle tricks, meaning another
	 * process will be invoking put_page().
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
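
/* Illustrative sketch (not part of this file): inside its own NAPI
 * poll, e.g. on XDP_DROP, a driver can recycle straight into the
 * lockless alloc cache via the page_pool_recycle_direct() wrapper
 * from <net/page_pool.h>:
 *
 *	case XDP_DROP:
 *		page_pool_recycle_direct(pool, page);
 *		break;
 */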

static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
	struct page *page;

	if (pool->destroy_cnt)
		return;

	/* Empty alloc cache, assuming the caller made sure this is
	 * no longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}
}

static void page_pool_scrub(struct page_pool *pool)
{
	page_pool_empty_alloc_cache_once(pool);
	pool->destroy_cnt++;

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

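/* Used by the XDP memory-model registration code: take an extra user
 * reference on the pool and install a disconnect callback, which
 * page_pool_free() invokes when the pool is finally torn down.
 */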
void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
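
/* Illustrative teardown order (sketch, not part of this file): a
 * driver first stops RX, so no new pages are handed out, and only
 * then calls page_pool_destroy(pool).  Pages still in-flight keep
 * the pool alive via the deferred release work until returned.
 */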

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
	trace_page_pool_update_nid(pool, new_nid);
	pool->p.nid = new_nid;
}
EXPORT_SYMBOL(page_pool_update_nid);