page_pool: handle page recycle for NUMA_NO_NODE condition
net/core/page_pool.c

/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *      Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *      Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

static int page_pool_init(struct page_pool *pool,
                          const struct page_pool_params *params)
{
        unsigned int ring_qsize = 1024; /* Default */

        memcpy(&pool->p, params, sizeof(pool->p));

        /* Validate only known flags were used */
        if (pool->p.flags & ~(PP_FLAG_ALL))
                return -EINVAL;

        if (pool->p.pool_size)
                ring_qsize = pool->p.pool_size;

        /* Sanity limit mem that can be pinned down */
        if (ring_qsize > 32768)
                return -E2BIG;

        /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
         * DMA_BIDIRECTIONAL allows pages to also be used for DMA sending,
         * which is the XDP_TX use-case.
         */
        if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
            (pool->p.dma_dir != DMA_BIDIRECTIONAL))
                return -EINVAL;

        if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
                /* In order to request DMA-sync-for-device the page
                 * needs to be mapped
                 */
                if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                        return -EINVAL;

                if (!pool->p.max_len)
                        return -EINVAL;

                /* pool->p.offset has to be set according to the address
                 * offset used by the DMA engine to start copying rx data
                 */
        }

        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;

        atomic_set(&pool->pages_state_release_cnt, 0);

        /* Drivers that call page_pool_create() must also call page_pool_destroy() */
        refcount_set(&pool->user_cnt, 1);

        if (pool->p.flags & PP_FLAG_DMA_MAP)
                get_device(pool->p.dev);

        return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
        struct page_pool *pool;
        int err;

        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        err = page_pool_init(pool, params);
        if (err < 0) {
                pr_warn("%s() gave up with errno %d\n", __func__, err);
                kfree(pool);
                return ERR_PTR(err);
        }

        return pool;
}
EXPORT_SYMBOL(page_pool_create);
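
/* Usage sketch (editor's illustration, not kernel API documentation): a
 * driver RX queue could create a pool along these lines. The "dev" name
 * and the numbers are hypothetical; the fields shown are the
 * page_pool_params members this file actually consumes.
 *
 *      struct page_pool_params pp = {
 *              .flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
 *              .order     = 0,
 *              .pool_size = 256,              (ptr_ring size, at most 32768)
 *              .nid       = NUMA_NO_NODE,     (refill then prefers numa_mem_id())
 *              .dev       = dev,              (struct device doing the DMA)
 *              .dma_dir   = DMA_FROM_DEVICE,  (or DMA_BIDIRECTIONAL for XDP_TX)
 *              .max_len   = PAGE_SIZE,        (max bytes to dma-sync-for-device)
 *              .offset    = 0,                (where the HW starts writing rx data)
 *      };
 *      struct page_pool *pool = page_pool_create(&pp);
 *
 *      if (IS_ERR(pool))
 *              return PTR_ERR(pool);
 */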

static void __page_pool_return_page(struct page_pool *pool, struct page *page);

noinline
static struct page *page_pool_refill_alloc_cache(struct page_pool *pool,
                                                 bool refill)
{
        struct ptr_ring *r = &pool->ring;
        struct page *page;
        int pref_nid; /* preferred NUMA node */

        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r))
                return NULL;

        /* Softirq guarantees that the CPU, and thus the NUMA node, is
         * stable. This assumes the CPU refilling the driver RX-ring also
         * runs the RX-NAPI.
         */
        pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;

        /* Slower-path: Get pages from locked ring queue */
        spin_lock(&r->consumer_lock);

        /* Refill alloc array, but only if NUMA match */
        do {
                page = __ptr_ring_consume(r);
                if (unlikely(!page))
                        break;

                if (likely(page_to_nid(page) == pref_nid)) {
                        pool->alloc.cache[pool->alloc.count++] = page;
                } else {
                        /* NUMA mismatch:
                         * (1) release 1 page to the page allocator and
                         * (2) break out to fall through to alloc_pages_node.
                         * This limits stress on the page buddy allocator.
                         */
                        __page_pool_return_page(pool, page);
                        page = NULL;
                        break;
                }
        } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL &&
                 refill);

        /* Return last page */
        if (likely(pool->alloc.count > 0))
                page = pool->alloc.cache[--pool->alloc.count];

        spin_unlock(&r->consumer_lock);
        return page;
}

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
        bool refill = false;
        struct page *page;

        /* Test for safe-context, caller should provide this guarantee */
        if (likely(in_serving_softirq())) {
                if (likely(pool->alloc.count)) {
                        /* Fast-path */
                        page = pool->alloc.cache[--pool->alloc.count];
                        return page;
                }
                refill = true;
        }

        page = page_pool_refill_alloc_cache(pool, refill);
        return page;
}

static void page_pool_dma_sync_for_device(struct page_pool *pool,
                                          struct page *page,
                                          unsigned int dma_sync_size)
{
        dma_sync_size = min(dma_sync_size, pool->p.max_len);
        dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
                                         pool->p.offset, dma_sync_size,
                                         pool->p.dma_dir);
}

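/* Note (editor's illustration): only the part of the buffer the device will
 * DMA into is synced, i.e. the byte range
 *      [dma_addr + pool->p.offset, dma_addr + pool->p.offset + min(dma_sync_size, pool->p.max_len))
 * So a pool created with, say, offset = XDP_PACKET_HEADROOM and
 * max_len = PAGE_SIZE - XDP_PACKET_HEADROOM never syncs the headroom
 * region that the CPU owns.
 */
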
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
                                                 gfp_t _gfp)
{
        struct page *page;
        gfp_t gfp = _gfp;
        dma_addr_t dma;

        /* We could always set __GFP_COMP, and avoid this branch, as
         * prep_new_page() can handle order-0 with __GFP_COMP.
         */
        if (pool->p.order)
                gfp |= __GFP_COMP;

        /* FUTURE development:
         *
         * Current slow-path essentially falls back to single page
         * allocations, which doesn't improve performance. This code
         * needs bulk allocation support from the page allocator code.
         */

        /* Cache was empty, do real allocation */
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
        if (!page)
                return NULL;

        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_map;

        /* Setup DMA mapping: use the 'struct page' area for storing the
         * DMA address, since dma_addr_t can be either 32 or 64 bits and
         * does not always fit into page private data (e.g. a 32-bit CPU
         * with 64-bit DMA capabilities).
         * This mapping is kept for the lifetime of the page, until it
         * leaves the pool.
         */
        dma = dma_map_page_attrs(pool->p.dev, page, 0,
                                 (PAGE_SIZE << pool->p.order),
                                 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
        if (dma_mapping_error(pool->p.dev, dma)) {
                put_page(page);
                return NULL;
        }
        page->dma_addr = dma;

        if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
                page_pool_dma_sync_for_device(pool, page, pool->p.max_len);

skip_dma_map:
        /* Track how many pages are held 'in-flight' */
        pool->pages_state_hold_cnt++;

        trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

        /* A page that was just allocated should/must have refcnt 1. */
        return page;
}

/* For using page_pool to replace alloc_pages() API calls, while providing
 * a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
        struct page *page;

        /* Fast-path: Get a page from cache */
        page = __page_pool_get_cached(pool);
        if (page)
                return page;

        /* Slow-path: cache empty, do real allocation */
        page = __page_pool_alloc_pages_slow(pool, gfp);
        return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
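
/* Usage sketch (editor's illustration): from an RX refill routine running
 * in NAPI/softirq context a driver could allocate like this; the GFP flags
 * and the "rq" name are hypothetical.
 *
 *      struct page *page;
 *
 *      page = page_pool_alloc_pages(rq->page_pool, GFP_ATOMIC | __GFP_NOWARN);
 *      if (!page)
 *              return -ENOMEM;
 *      (post page->dma_addr + pool->p.offset to the HW descriptor ring)
 *
 * Callers outside softirq context still work, but they skip the lockless
 * alloc-cache fast path above and go through the locked ptr_ring consumer
 * or the page allocator instead.
 */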

/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))
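
/* Worked example (editor's note): the subtraction stays correct across u32
 * wrap-around. If hold_cnt has wrapped to 2 while release_cnt is still
 * 4294967294 (0xfffffffe), then
 *      _distance(2, 4294967294) = (s32)(2 - 4294967294) = 4
 * because u32 arithmetic wraps, i.e. 4 pages are still in-flight. This
 * holds as long as the true distance is below 2^31.
 */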

static s32 page_pool_inflight(struct page_pool *pool)
{
        u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
        u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
        s32 inflight;

        inflight = _distance(hold_cnt, release_cnt);

        trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
        WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

        return inflight;
}

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
                                   struct page *page)
{
        dma_addr_t dma;
        int count;

        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_unmap;

        dma = page->dma_addr;
        /* DMA unmap */
        dma_unmap_page_attrs(pool->p.dev, dma,
                             PAGE_SIZE << pool->p.order, pool->p.dma_dir,
                             DMA_ATTR_SKIP_CPU_SYNC);
        page->dma_addr = 0;
skip_dma_unmap:
        /* This may be the last page returned, releasing the pool, so
         * it is not safe to reference pool afterwards.
         */
        count = atomic_inc_return(&pool->pages_state_release_cnt);
        trace_page_pool_state_release(pool, page, count);
}

/* unmap the page and clean our state */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
        /* When a page is unmapped, this implies it will not be
         * returned to the page_pool.
         */
        __page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
        __page_pool_clean_page(pool, page);

        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
         * __page_cache_release() call).
         */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
                                          struct page *page)
{
        int ret;
        /* BH protection not needed if current is serving softirq */
        if (in_serving_softirq())
                ret = ptr_ring_produce(&pool->ring, page);
        else
                ret = ptr_ring_produce_bh(&pool->ring, page);

        return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
                                       struct page_pool *pool)
{
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
                return false;

        /* Caller MUST have verified/know (page_ref_count(page) == 1) */
        pool->alloc.cache[pool->alloc.count++] = page;
        return true;
}

/* A page is NOT reusable when:
 * 1) it was allocated while the system was under some pressure (page_is_pfmemalloc).
 */
static bool pool_page_reusable(struct page_pool *pool, struct page *page)
{
        return !page_is_pfmemalloc(page);
}

void __page_pool_put_page(struct page_pool *pool, struct page *page,
                          unsigned int dma_sync_size, bool allow_direct)
{
        /* This allocator is optimized for the XDP mode that uses
         * one-frame-per-page, but has fallbacks that act like the
         * regular page allocator APIs.
         *
         * refcnt == 1 means page_pool owns page, and can recycle it.
         */
        if (likely(page_ref_count(page) == 1 &&
                   pool_page_reusable(pool, page))) {
                /* Read barrier done in page_ref_count / READ_ONCE */

                if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
                        page_pool_dma_sync_for_device(pool, page,
                                                      dma_sync_size);

                if (allow_direct && in_serving_softirq())
                        if (__page_pool_recycle_direct(page, pool))
                                return;

                if (!__page_pool_recycle_into_ring(pool, page)) {
                        /* Cache full, fallback to free pages */
                        __page_pool_return_page(pool, page);
                }
                return;
        }
        /* Fallback/non-XDP mode: the API user has an elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
         * want to keep doing this to save memory and do refcnt based
         * recycling. Support this use case too, to ease drivers
         * switching between XDP/non-XDP.
         *
         * In case page_pool maintains the DMA mapping, the API user must
         * call page_pool_put_page once. In this elevated refcnt
         * case, the DMA is unmapped/released, as the driver is likely
         * doing refcnt based recycle tricks, meaning another process
         * will be invoking put_page.
         */
        __page_pool_clean_page(pool, page);
        put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
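
/* Usage sketch (editor's illustration): on an XDP_DROP in the driver's
 * NAPI poll loop the page can be recycled directly into the alloc cache;
 * passing -1 as dma_sync_size relies on the min() clamp against
 * pool->p.max_len above to sync the full RX region. The "rq" name is
 * hypothetical.
 *
 *      case XDP_DROP:
 *              __page_pool_put_page(rq->page_pool, page, -1, true);
 *              break;
 *
 * Outside softirq context, or when unsure, pass allow_direct = false so
 * the page goes through the ptr_ring instead of the lockless cache.
 */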

static void __page_pool_empty_ring(struct page_pool *pool)
{
        struct page *page;

        /* Empty recycle ring */
        while ((page = ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
                if (!(page_ref_count(page) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, page_ref_count(page));

                __page_pool_return_page(pool, page);
        }
}

static void page_pool_free(struct page_pool *pool)
{
        if (pool->disconnect)
                pool->disconnect(pool);

        ptr_ring_cleanup(&pool->ring, NULL);

        if (pool->p.flags & PP_FLAG_DMA_MAP)
                put_device(pool->p.dev);

        kfree(pool);
}

static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
        struct page *page;

        if (pool->destroy_cnt)
                return;

        /* Empty alloc cache, assume caller made sure this is
         * no longer in use, and page_pool_alloc_pages() cannot be
         * called concurrently.
         */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
                __page_pool_return_page(pool, page);
        }
}

static void page_pool_scrub(struct page_pool *pool)
{
        page_pool_empty_alloc_cache_once(pool);
        pool->destroy_cnt++;

        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
        __page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
        int inflight;

        page_pool_scrub(pool);
        inflight = page_pool_inflight(pool);
        if (!inflight)
                page_pool_free(pool);

        return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
        struct delayed_work *dwq = to_delayed_work(wq);
        struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
        int inflight;

        inflight = page_pool_release(pool);
        if (!inflight)
                return;

        /* Periodic warning */
        if (time_after_eq(jiffies, pool->defer_warn)) {
                int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

                pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
                        __func__, inflight, sec);
                pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
        }

        /* Still not ready to be disconnected, retry later */
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
        refcount_inc(&pool->user_cnt);
        pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
        if (!pool)
                return;

        if (!page_pool_put(pool))
                return;

        if (!page_pool_release(pool))
                return;

        pool->defer_start = jiffies;
        pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;

        INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
        schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);

/* Caller must provide appropriate safe context, e.g. NAPI. */
void page_pool_update_nid(struct page_pool *pool, int new_nid)
{
        struct page *page;

        trace_page_pool_update_nid(pool, new_nid);
        pool->p.nid = new_nid;

        /* Flush pool alloc cache, as refill will check NUMA node */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
                __page_pool_return_page(pool, page);
        }
}
EXPORT_SYMBOL(page_pool_update_nid);
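
/* Usage sketch (editor's illustration): a driver whose NAPI processing has
 * migrated to a CPU on a different node could resynchronize the pool from
 * its poll routine, e.g.:
 *
 *      if (rq->page_pool->p.nid != numa_mem_id())
 *              page_pool_update_nid(rq->page_pool, numa_mem_id());
 *
 * The "rq" name is hypothetical; numa_mem_id() matches the preferred node
 * the refill path above uses when the pool was created with NUMA_NO_NODE.
 */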