/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

#include <trace/events/page_pool.h>

#define DEFER_TIME (msecs_to_jiffies(1000))
#define DEFER_WARN_INTERVAL (60 * HZ)

static int page_pool_init(struct page_pool *pool,
			  const struct page_pool_params *params)
{
	unsigned int ring_qsize = 1024; /* Default */

	memcpy(&pool->p, params, sizeof(pool->p));

	/* Validate only known flags were used */
	if (pool->p.flags & ~(PP_FLAG_ALL))
		return -EINVAL;

	if (pool->p.pool_size)
		ring_qsize = pool->p.pool_size;

	/* Sanity limit mem that can be pinned down */
	if (ring_qsize > 32768)
		return -E2BIG;

	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
	 * DMA_BIDIRECTIONAL allows the page to also be used for DMA
	 * transmit, which is the XDP_TX use-case.
	 */
	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
		return -EINVAL;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
		return -ENOMEM;

	atomic_set(&pool->pages_state_release_cnt, 0);

	/* The driver calling page_pool_create() also calls page_pool_destroy() */
	refcount_set(&pool->user_cnt, 1);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		get_device(pool->p.dev);

	return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
	struct page_pool *pool;
	int err;

	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
	if (!pool)
		return ERR_PTR(-ENOMEM);

	err = page_pool_init(pool, params);
	if (err < 0) {
		pr_warn("%s() gave up with errno %d\n", __func__, err);
		kfree(pool);
		return ERR_PTR(err);
	}

	return pool;
}
EXPORT_SYMBOL(page_pool_create);

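/* Illustrative sketch (not part of this file): a driver would typically
 * create one page_pool per RX queue during setup, roughly along these
 * lines. The field values and the 'pdev'/'pp' names below are hypothetical
 * driver choices, not requirements; the fields themselves are the ones
 * consumed by page_pool_init() above.
 *
 *	struct page_pool_params pp_params = {
 *		.flags		= PP_FLAG_DMA_MAP,
 *		.order		= 0,
 *		.pool_size	= 256,
 *		.nid		= NUMA_NO_NODE,
 *		.dev		= &pdev->dev,
 *		.dma_dir	= DMA_FROM_DEVICE,
 *	};
 *	struct page_pool *pp = page_pool_create(&pp_params);
 *
 *	if (IS_ERR(pp))
 *		return PTR_ERR(pp);
 */
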
/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
	struct ptr_ring *r = &pool->ring;
	bool refill = false;
	struct page *page;

	/* Test for safe-context, caller should provide this guarantee */
	if (likely(in_serving_softirq())) {
		if (likely(pool->alloc.count)) {
			/* Fast-path */
			page = pool->alloc.cache[--pool->alloc.count];
			return page;
		}
		refill = true;
	}

	/* Quicker fallback, avoid locks when ring is empty */
	if (__ptr_ring_empty(r))
		return NULL;

	/* Slow-path: Get page from locked ring queue,
	 * refill alloc array if requested.
	 */
	spin_lock(&r->consumer_lock);
	page = __ptr_ring_consume(r);
	if (refill)
		pool->alloc.count = __ptr_ring_consume_batched(r,
							pool->alloc.cache,
							PP_ALLOC_CACHE_REFILL);
	spin_unlock(&r->consumer_lock);
	return page;
}

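/* Sketch of the two-tier caching above (informational comment, batch size
 * assumed): the alloc.cache array is lockless and only used when the caller
 * runs in softirq context, so a NAPI poll loop allocating many pages pays
 * the ptr_ring consumer lock roughly once per refill batch. E.g. with
 * alloc.count == 0 and PP_ALLOC_CACHE_REFILL assumed to be 64:
 *
 *	1st allocation:  takes the lock, consumes one page to return and
 *	                 batch-refills up to 64 pages into alloc.cache
 *	next 64 allocs:  served from alloc.cache with no locking at all
 */
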
/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
						 gfp_t _gfp)
{
	struct page *page;
	gfp_t gfp = _gfp;
	dma_addr_t dma;

	/* We could always set __GFP_COMP, and avoid this branch, as
	 * prep_new_page() can handle order-0 with __GFP_COMP.
	 */
	if (pool->p.order)
		gfp |= __GFP_COMP;

	/* FUTURE development:
	 *
	 * Current slow-path essentially falls back to single page
	 * allocations, which doesn't improve performance. This code
	 * needs bulk allocation support from the page allocator code.
	 */

	/* Cache was empty, do real allocation */
	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
	if (!page)
		return NULL;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_map;

	/* Setup DMA mapping: use the 'struct page' area for storing the
	 * DMA address, since dma_addr_t can be either 32 or 64 bits and does
	 * not always fit into page private data (i.e. a 32-bit CPU with
	 * 64-bit DMA capabilities).
	 * This mapping is kept for the lifetime of the page, until it leaves
	 * the pool.
	 */
	dma = dma_map_page_attrs(pool->p.dev, page, 0,
				 (PAGE_SIZE << pool->p.order),
				 pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(pool->p.dev, dma)) {
		put_page(page);
		return NULL;
	}
	page->dma_addr = dma;

skip_dma_map:
	/* Track how many pages are held 'in-flight' */
	pool->pages_state_hold_cnt++;

	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);

	/* When a page has just been allocated it should/must have refcnt 1. */
	return page;
}

/* For using page_pool to replace alloc_pages() API calls, while providing
 * a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
	struct page *page;

	/* Fast-path: Get a page from cache */
	page = __page_pool_get_cached(pool);
	if (page)
		return page;

	/* Slow-path: cache empty, do real allocation */
	page = __page_pool_alloc_pages_slow(pool, gfp);
	return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);

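/* Illustrative use (not part of this file): a driver's RX refill path runs
 * in NAPI/softirq context and would typically allocate with GFP_ATOMIC;
 * the 'rxq' name below is hypothetical.
 *
 *	struct page *page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
 *
 *	if (!page)
 *		return -ENOMEM;
 *	// With PP_FLAG_DMA_MAP set, the pool stored the mapping here:
 *	dma_addr_t dma = page->dma_addr;
 */
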
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
 * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
 */
#define _distance(a, b) (s32)((a) - (b))

static s32 page_pool_inflight(struct page_pool *pool)
{
	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
	s32 inflight;

	inflight = _distance(hold_cnt, release_cnt);

	trace_page_pool_inflight(pool, inflight, hold_cnt, release_cnt);
	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);

	return inflight;
}

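/* Worked example of the serial-number arithmetic above (illustrative only):
 * both counters are free-running u32 values that may wrap, yet the signed
 * subtraction still yields the correct in-flight count as long as the true
 * distance stays below 2^31. E.g. with
 *
 *	hold_cnt    = 0x00000005   (already wrapped past UINT_MAX)
 *	release_cnt = 0xfffffffe
 *
 * _distance(hold_cnt, release_cnt) = (s32)(0x00000005 - 0xfffffffe)
 *                                  = (s32)0x00000007 = 7 pages in flight.
 */
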
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
				   struct page *page)
{
	dma_addr_t dma;
	int count;

	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
		goto skip_dma_unmap;

	dma = page->dma_addr;
	/* DMA unmap */
	dma_unmap_page_attrs(pool->p.dev, dma,
			     PAGE_SIZE << pool->p.order, pool->p.dma_dir,
			     DMA_ATTR_SKIP_CPU_SYNC);
	page->dma_addr = 0;
skip_dma_unmap:
	/* This may be the last page returned, releasing the pool, so
	 * it is not safe to reference pool afterwards.
	 */
	count = atomic_inc_return(&pool->pages_state_release_cnt);
	trace_page_pool_state_release(pool, page, count);
}

/* unmap the page and clean our state */
void page_pool_unmap_page(struct page_pool *pool, struct page *page)
{
	/* When a page is unmapped, this implies it will not be
	 * returned to the page_pool.
	 */
	__page_pool_clean_page(pool, page);
}
EXPORT_SYMBOL(page_pool_unmap_page);

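/* Illustrative use (not part of this file): a driver that hands a page up
 * the stack (e.g. attached to an SKB) and will never return it to the pool
 * releases the pool's DMA mapping first; the 'rxq', 'offset' and 'len'
 * names below are hypothetical.
 *
 *	page_pool_unmap_page(rxq->page_pool, page);
 *	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
 */
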
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
	__page_pool_clean_page(pool, page);

	put_page(page);
	/* An optimization would be to call __free_pages(page, pool->p.order)
	 * knowing page is not part of page-cache (thus avoiding a
	 * __page_cache_release() call).
	 */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
					  struct page *page)
{
	int ret;
	/* BH protection not needed if current is serving softirq */
	if (in_serving_softirq())
		ret = ptr_ring_produce(&pool->ring, page);
	else
		ret = ptr_ring_produce_bh(&pool->ring, page);

	return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
				       struct page_pool *pool)
{
	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
		return false;

	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
	pool->alloc.cache[pool->alloc.count++] = page;
	return true;
}

void __page_pool_put_page(struct page_pool *pool,
			  struct page *page, bool allow_direct)
{
	/* This allocator is optimized for the XDP mode that uses
	 * one-frame-per-page, but has fallbacks that act like the
	 * regular page allocator APIs.
	 *
	 * refcnt == 1 means page_pool owns page, and can recycle it.
	 */
	if (likely(page_ref_count(page) == 1)) {
		/* Read barrier done in page_ref_count / READ_ONCE */

		if (allow_direct && in_serving_softirq())
			if (__page_pool_recycle_direct(page, pool))
				return;

		if (!__page_pool_recycle_into_ring(pool, page)) {
			/* Cache full, fallback to free pages */
			__page_pool_return_page(pool, page);
		}
		return;
	}
	/* Fallback/non-XDP mode: the API user has an elevated refcnt.
	 *
	 * Many drivers split up the page into fragments, and some
	 * want to keep doing this to save memory and do refcnt based
	 * recycling. Support this use case too, to ease drivers
	 * switching between XDP/non-XDP.
	 *
	 * In case page_pool maintains the DMA mapping, the API user must
	 * call page_pool_put_page() once. In this elevated refcnt
	 * case, the DMA is unmapped/released, as the driver is likely
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
	__page_pool_clean_page(pool, page);
	put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);

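/* Illustrative use (not part of this file; the wrapper names are assumed to
 * come from <net/page_pool.h> of this era): on XDP_DROP inside the NAPI
 * poll loop a driver can recycle straight into the lockless cache, while a
 * path that is not guaranteed to run in softirq context must not allow
 * direct recycling. The 'rxq' name below is hypothetical.
 *
 *	case XDP_DROP:
 *		page_pool_recycle_direct(rxq->page_pool, page);
 *		break;
 *
 *	// outside softirq / unknown context:
 *	page_pool_put_page(pool, page, false);
 */
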
static void __page_pool_empty_ring(struct page_pool *pool)
{
	struct page *page;

	/* Empty recycle ring */
	while ((page = ptr_ring_consume_bh(&pool->ring))) {
		/* Verify the refcnt invariant of cached pages */
		if (!(page_ref_count(page) == 1))
			pr_crit("%s() page_pool refcnt %d violation\n",
				__func__, page_ref_count(page));

		__page_pool_return_page(pool, page);
	}
}

static void page_pool_free(struct page_pool *pool)
{
	if (pool->disconnect)
		pool->disconnect(pool);

	ptr_ring_cleanup(&pool->ring, NULL);

	if (pool->p.flags & PP_FLAG_DMA_MAP)
		put_device(pool->p.dev);

	kfree(pool);
}

static void page_pool_scrub(struct page_pool *pool)
{
	struct page *page;

	/* Empty alloc cache, assume caller made sure this is
	 * no-longer in use, and page_pool_alloc_pages() cannot be
	 * called concurrently.
	 */
	while (pool->alloc.count) {
		page = pool->alloc.cache[--pool->alloc.count];
		__page_pool_return_page(pool, page);
	}

	/* No more consumers should exist, but producers could still
	 * be in-flight.
	 */
	__page_pool_empty_ring(pool);
}

static int page_pool_release(struct page_pool *pool)
{
	int inflight;

	page_pool_scrub(pool);
	inflight = page_pool_inflight(pool);
	if (!inflight)
		page_pool_free(pool);

	return inflight;
}

static void page_pool_release_retry(struct work_struct *wq)
{
	struct delayed_work *dwq = to_delayed_work(wq);
	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
	int inflight;

	inflight = page_pool_release(pool);
	if (!inflight)
		return;

	/* Periodic warning */
	if (time_after_eq(jiffies, pool->defer_warn)) {
		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;

		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
			__func__, inflight, sec);
		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
	}

	/* Still not ready to be disconnected, retry later */
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}

void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
{
	refcount_inc(&pool->user_cnt);
	pool->disconnect = disconnect;
}

void page_pool_destroy(struct page_pool *pool)
{
	if (!pool)
		return;

	if (!page_pool_put(pool))
		return;

	if (!page_pool_release(pool))
		return;

	pool->defer_start = jiffies;
	pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;

	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
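
/* Sketch of the shutdown sequence implemented above (informational comment,
 * not original to this file): when the driver calls page_pool_destroy() and
 * drops the last user reference, the pool is scrubbed; if pages are still
 * in flight (e.g. still held by the stack), page_pool_release_retry()
 * re-checks the hold/release counters every DEFER_TIME (1 second) and prints
 * a stall warning at most once per DEFER_WARN_INTERVAL (60 seconds), freeing
 * the pool only once the inflight count drops to zero.
 */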