// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

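/* Each umem keeps a list of the XDP sockets that share it. Writers are
 * serialized by xsk_list_lock; readers traverse the list under RCU.
 */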
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (xs->dev) {
		spin_lock_irqsave(&umem->xsk_list_lock, flags);
		list_del_rcu(&xs->list);
		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);

		if (umem->zc)
			synchronize_net();
	}
}

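/* Ask the driver whether a umem is already set up on this queue.
 * Returns 0 for "no umem", a positive value for "umem present", or a
 * negative errno propagated from the driver's ndo_bpf() callback.
 */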
int xdp_umem_query(struct net_device *dev, u16 queue_id)
{
	struct netdev_bpf bpf;

	ASSERT_RTNL();

	memset(&bpf, 0, sizeof(bpf));
	bpf.command = XDP_QUERY_XSK_UMEM;
	bpf.xsk.queue_id = queue_id;

	if (!dev->netdev_ops->ndo_bpf)
		return 0;
	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
}

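/* Try to set up the umem on a device/queue pair, honoring the XDP_COPY
 * and XDP_ZEROCOPY bind flags: XDP_COPY short-circuits to copy mode,
 * while a driver that lacks the needed ndo hooks, or that already has
 * a umem on this queue, fails the bind if zero-copy was forced and
 * falls back to copy mode otherwise.
 */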
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u32 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (force_copy)
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
		return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */

	rtnl_lock();
	err = xdp_umem_query(dev, queue_id);
	if (err) {
		err = err < 0 ? -EOPNOTSUPP : -EBUSY;
		goto err_rtnl_unlock;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_rtnl_unlock;
	rtnl_unlock();

	dev_hold(dev);
	umem->dev = dev;
	umem->queue_id = queue_id;
	umem->zc = true;
	return 0;

err_rtnl_unlock:
	rtnl_unlock();
	return force_zc ? err : 0; /* fail or fallback */
}

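/* Undo xdp_umem_assign_dev(): tell the driver to drop the umem from
 * the queue and release the reference held on the netdev.
 */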
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->dev) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");

		dev_put(umem->dev);
		umem->dev = NULL;
	}
}

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

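/* Final teardown, run from a workqueue once the last user is gone:
 * detach from the device, destroy the fill and completion queues,
 * unpin and unaccount the user pages, and free the umem itself.
 */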
static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

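/* Reference counting for the umem. The release path may sleep (it
 * takes the rtnl lock, among other things), so the final put defers
 * the actual teardown to a workqueue.
 */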
void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

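/* Pin the user memory area so its pages cannot be swapped out or
 * migrated while the kernel (and potentially the NIC) uses them. On a
 * partial pin, everything that was pinned is released again.
 */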
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

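/* Charge the pinned pages against the user's RLIMIT_MEMLOCK limit,
 * unless the caller has CAP_IPC_LOCK. The cmpxchg loop updates
 * locked_vm without having to take a lock.
 */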
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

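/* Validate a registration request from userspace and set up the umem:
 * the area must be page aligned, the chunk size a power of two between
 * XDP_UMEM_MIN_CHUNK_SIZE and PAGE_SIZE, and chunks may not straddle
 * page boundaries. As a rough userspace sketch (illustrative values
 * and names, not taken from this file), an application would pass
 * something like:
 *
 *	struct xdp_umem_reg mr = {
 *		.addr = (__u64)(uintptr_t)bufs,	// page-aligned buffer
 *		.len = NUM_FRAMES * 2048,	// whole number of pages
 *		.chunk_size = 2048,		// power of two <= PAGE_SIZE
 *		.headroom = 0,
 *	};
 *	setsockopt(xsk_fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
 */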
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_account;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}

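/* Allocate a umem and register the user memory described by @mr.
 * Returns a valid pointer on success, an ERR_PTR() encoded errno on
 * failure.
 */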
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

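/* A umem is only usable once userspace has created both its fill queue
 * and its completion queue.
 */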
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}