/* SPDX-License-Identifier: GPL-2.0
 *
 * page_pool.c
 *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
 *	Copyright (C) 2016 Red Hat, Inc.
 */
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>

#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */

static int page_pool_init(struct page_pool *pool,
                          const struct page_pool_params *params)
{
        unsigned int ring_qsize = 1024; /* Default */

        memcpy(&pool->p, params, sizeof(pool->p));

        /* Validate only known flags were used */
        if (pool->p.flags & ~(PP_FLAG_ALL))
                return -EINVAL;

        if (pool->p.pool_size)
                ring_qsize = pool->p.pool_size;

        /* Sanity limit mem that can be pinned down */
        if (ring_qsize > 32768)
                return -E2BIG;

        /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
         * DMA_BIDIRECTIONAL allows the page to also be used for DMA
         * transmit, which is the XDP_TX use-case.
         */
        if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
            (pool->p.dma_dir != DMA_BIDIRECTIONAL))
                return -EINVAL;

        if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
                return -ENOMEM;

        return 0;
}

struct page_pool *page_pool_create(const struct page_pool_params *params)
{
        struct page_pool *pool;
        int err = 0;

        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
        if (!pool)
                return ERR_PTR(-ENOMEM);

        err = page_pool_init(pool, params);
        if (err < 0) {
                pr_warn("%s() gave up with errno %d\n", __func__, err);
                kfree(pool);
                return ERR_PTR(err);
        }
        return pool;
}
EXPORT_SYMBOL(page_pool_create);
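
/* Illustrative sketch, not part of this file: how a driver might configure
 * and create a page_pool for one RX queue. The page_pool_params members used
 * here (flags, order, pool_size, nid, dev, dma_dir) are the ones the code
 * above operates on; "struct my_rx_queue" and its fields are hypothetical.
 */
static int my_driver_create_page_pool(struct my_rx_queue *rxq,
                                      struct device *dev, int numa_node)
{
        struct page_pool_params pp_params = { 0 };
        struct page_pool *pp;

        pp_params.order     = 0;               /* order-0 pages, one frame per page */
        pp_params.flags     = PP_FLAG_DMA_MAP; /* pool keeps the DMA mapping */
        pp_params.pool_size = 1024;            /* ptr_ring size, capped at 32768 */
        pp_params.nid       = numa_node;
        pp_params.dev       = dev;
        pp_params.dma_dir   = DMA_FROM_DEVICE; /* DMA_BIDIRECTIONAL if doing XDP_TX */

        pp = page_pool_create(&pp_params);
        if (IS_ERR(pp))
                return PTR_ERR(pp);

        rxq->page_pool = pp;                   /* hypothetical driver field */
        return 0;
}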

/* fast path */
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
        struct ptr_ring *r = &pool->ring;
        struct page *page;

        /* Quicker fallback, avoid locks when ring is empty */
        if (__ptr_ring_empty(r))
                return NULL;

        /* Test for safe-context, caller should provide this guarantee */
        if (likely(in_serving_softirq())) {
                if (likely(pool->alloc.count)) {
                        /* Fast-path */
                        page = pool->alloc.cache[--pool->alloc.count];
                        return page;
                }
                /* Slower-path: alloc array empty, time to refill.
                 *
                 * Open-coded bulk ptr_ring consumer.
                 *
                 * Discussion: the ring consumer lock is not really
                 * needed due to the softirq/NAPI protection, but we
                 * will later need the ability to reclaim pages from
                 * the ring. Thus, keep the locks.
                 */
                spin_lock(&r->consumer_lock);
                while ((page = __ptr_ring_consume(r))) {
                        if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
                                break;
                        pool->alloc.cache[pool->alloc.count++] = page;
                }
                spin_unlock(&r->consumer_lock);
                return page;
        }

        /* Slow-path: Get page from locked ring queue */
        page = ptr_ring_consume(&pool->ring);
        return page;
}

/* slow path */
noinline
static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
                                                 gfp_t _gfp)
{
        struct page *page;
        gfp_t gfp = _gfp;
        dma_addr_t dma;

        /* We could always set __GFP_COMP, and avoid this branch, as
         * prep_new_page() can handle order-0 with __GFP_COMP.
         */
        if (pool->p.order)
                gfp |= __GFP_COMP;

        /* FUTURE development:
         *
         * The current slow-path essentially falls back to single page
         * allocations, which doesn't improve performance. This code
         * needs bulk allocation support from the page allocator code.
         */

        /* Cache was empty, do real allocation */
        page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
        if (!page)
                return NULL;

        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                goto skip_dma_map;

        /* Setup DMA mapping: use page->private for the DMA address.
         * This mapping is kept for the lifetime of the page, until it
         * leaves the pool.
         */
        dma = dma_map_page(pool->p.dev, page, 0,
                           (PAGE_SIZE << pool->p.order),
                           pool->p.dma_dir);
        if (dma_mapping_error(pool->p.dev, dma)) {
                put_page(page);
                return NULL;
        }
        set_page_private(page, dma); /* page->private = dma; */

skip_dma_map:
        /* A freshly allocated page should/must have refcnt == 1. */
        return page;
}

/* For using page_pool to replace alloc_pages() API calls, while
 * providing a synchronization guarantee for the allocation side.
 */
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
        struct page *page;

        /* Fast-path: Get a page from cache */
        page = __page_pool_get_cached(pool);
        if (page)
                return page;

        /* Slow-path: cache empty, do real allocation */
        page = __page_pool_alloc_pages_slow(pool, gfp);
        return page;
}
EXPORT_SYMBOL(page_pool_alloc_pages);
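
/* Illustrative sketch, not part of this file: a driver RX-ring refill loop
 * using page_pool_alloc_pages(). When PP_FLAG_DMA_MAP is set, the slow path
 * above stores the DMA address in page->private, so it can be read back via
 * page_private(). "struct my_rx_queue" and its descriptor layout are
 * hypothetical.
 */
static int my_driver_rx_refill(struct my_rx_queue *rxq, unsigned int budget)
{
        unsigned int i;

        for (i = 0; i < budget; i++) {
                struct page *page;
                dma_addr_t dma;

                /* GFP_ATOMIC: refill typically runs from NAPI/softirq context */
                page = page_pool_alloc_pages(rxq->page_pool, GFP_ATOMIC);
                if (!page)
                        return -ENOMEM;

                dma = page_private(page);       /* DMA addr set up by the pool */
                rxq->desc[i].addr = dma;        /* hypothetical HW descriptor */
                rxq->rx_page[i] = page;
        }
        return 0;
}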

/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
                                   struct page *page)
{
        if (!(pool->p.flags & PP_FLAG_DMA_MAP))
                return;

        /* DMA unmap */
        dma_unmap_page(pool->p.dev, page_private(page),
                       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
        set_page_private(page, 0);
}

/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
        __page_pool_clean_page(pool, page);
        put_page(page);
        /* An optimization would be to call __free_pages(page, pool->p.order)
         * knowing page is not part of page-cache (thus avoiding a
         * __page_cache_release() call).
         */
}

static bool __page_pool_recycle_into_ring(struct page_pool *pool,
                                          struct page *page)
{
        int ret;

        /* BH protection not needed if current is serving softirq */
        if (in_serving_softirq())
                ret = ptr_ring_produce(&pool->ring, page);
        else
                ret = ptr_ring_produce_bh(&pool->ring, page);

        return (ret == 0) ? true : false;
}

/* Only allow direct recycling in special circumstances, into the
 * alloc-side cache, e.g. during RX-NAPI processing for the XDP_DROP
 * use-case.
 *
 * Caller must provide appropriate safe context.
 */
static bool __page_pool_recycle_direct(struct page *page,
                                       struct page_pool *pool)
{
        if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
                return false;

        /* Caller MUST have verified/know (page_ref_count(page) == 1) */
        pool->alloc.cache[pool->alloc.count++] = page;
        return true;
}

void __page_pool_put_page(struct page_pool *pool,
                          struct page *page, bool allow_direct)
{
        /* This allocator is optimized for the XDP mode that uses
         * one frame per page, but it has fallbacks that act like the
         * regular page allocator APIs.
         *
         * refcnt == 1 means page_pool owns the page, and can recycle it.
         */
        if (likely(page_ref_count(page) == 1)) {
                /* Read barrier done in page_ref_count / READ_ONCE */

                if (allow_direct && in_serving_softirq())
                        if (__page_pool_recycle_direct(page, pool))
                                return;

                if (!__page_pool_recycle_into_ring(pool, page)) {
                        /* Cache full, fallback to freeing pages */
                        __page_pool_return_page(pool, page);
                }
                return;
        }
        /* Fallback/non-XDP mode: the API user has an elevated refcnt.
         *
         * Many drivers split up the page into fragments, and some
         * want to keep doing this to save memory and do refcnt based
         * recycling. Support this use case too, to ease drivers
         * switching between XDP/non-XDP.
         *
         * In case page_pool maintains the DMA mapping, the API user must
         * still call page_pool_put_page() once. In this elevated refcnt
         * case, the DMA mapping is unmapped/released here, as the driver
         * is likely doing refcnt based recycle tricks, meaning another
         * process will be invoking put_page().
         */
        __page_pool_clean_page(pool, page);
        put_page(page);
}
EXPORT_SYMBOL(__page_pool_put_page);
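
/* Illustrative sketch, not part of this file: recycling a page from a
 * driver's NAPI poll loop after an XDP_DROP verdict. allow_direct=true is
 * safe here because the call runs in softirq/NAPI context and the driver
 * knows it holds the only reference (refcnt == 1). The surrounding names
 * (my_rx_queue, my_run_xdp) are hypothetical.
 */
static void my_driver_handle_rx_frame(struct my_rx_queue *rxq,
                                      struct page *page)
{
        switch (my_run_xdp(rxq, page)) {        /* hypothetical XDP dispatch */
        case XDP_DROP:
                /* Direct recycle into the alloc-side cache (fast path) */
                __page_pool_put_page(rxq->page_pool, page, true);
                break;
        default:
                /* Frame passed up the stack; the page is released later,
                 * e.g. via put_page() once the skb is freed.
                 */
                break;
        }
}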

static void __page_pool_empty_ring(struct page_pool *pool)
{
        struct page *page;

        /* Empty recycle ring */
        while ((page = ptr_ring_consume_bh(&pool->ring))) {
                /* Verify the refcnt invariant of cached pages */
                if (!(page_ref_count(page) == 1))
                        pr_crit("%s() page_pool refcnt %d violation\n",
                                __func__, page_ref_count(page));

                __page_pool_return_page(pool, page);
        }
}

static void __page_pool_destroy_rcu(struct rcu_head *rcu)
{
        struct page_pool *pool;

        pool = container_of(rcu, struct page_pool, rcu);

        WARN(pool->alloc.count, "API usage violation");

        __page_pool_empty_ring(pool);
        ptr_ring_cleanup(&pool->ring, NULL);
        kfree(pool);
}

/* Cleanup and release resources */
void page_pool_destroy(struct page_pool *pool)
{
        struct page *page;

        /* Empty the alloc cache; assume the caller has made sure the pool
         * is no longer in use, and page_pool_alloc_pages() cannot be
         * called concurrently.
         */
        while (pool->alloc.count) {
                page = pool->alloc.cache[--pool->alloc.count];
                __page_pool_return_page(pool, page);
        }

        /* No more consumers should exist, but producers could still
         * be in-flight.
         */
        __page_pool_empty_ring(pool);

        /* An xdp_mem_allocator can still ref page_pool pointer */
        call_rcu(&pool->rcu, __page_pool_destroy_rcu);
}
EXPORT_SYMBOL(page_pool_destroy);
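
/* Illustrative sketch, not part of this file: typical driver teardown order.
 * The driver should stop its NAPI/RX processing and return every page it
 * still holds (here via __page_pool_put_page()) before calling
 * page_pool_destroy(), otherwise the alloc-cache WARN in
 * __page_pool_destroy_rcu() can trigger. The names my_rx_queue, napi, and
 * rx_page[] are hypothetical.
 */
static void my_driver_destroy_rx_queue(struct my_rx_queue *rxq)
{
        unsigned int i;

        napi_disable(&rxq->napi);               /* no more concurrent alloc/recycle */

        /* Return pages still sitting in the hypothetical RX ring */
        for (i = 0; i < rxq->ring_size; i++) {
                if (rxq->rx_page[i])
                        __page_pool_put_page(rxq->page_pool,
                                             rxq->rx_page[i], false);
        }

        page_pool_destroy(rxq->page_pool);      /* pool freed after RCU grace period */
}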