// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

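/* Sockets bound to this umem are kept on a list. Insertion and removal
 * are serialized with xsk_list_lock; lockless readers traverse the list
 * under RCU.
 */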
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_del_rcu(&xs->list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
			       u16 queue_id)
{
	if (queue_id >= max_t(unsigned int,
			      dev->real_num_rx_queues,
			      dev->real_num_tx_queues))
		return -EINVAL;

	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = umem;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = umem;

	return 0;
}

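/* Return the umem bound to @queue_id on @dev, or NULL if there is none. */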
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
				       u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		return dev->_rx[queue_id].umem;
	if (queue_id < dev->real_num_tx_queues)
		return dev->_tx[queue_id].umem;

	return NULL;
}

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	if (queue_id < dev->real_num_rx_queues)
		dev->_rx[queue_id].umem = NULL;
	if (queue_id < dev->real_num_tx_queues)
		dev->_tx[queue_id].umem = NULL;
}

int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	rtnl_lock();
	if (xdp_get_umem_from_qid(dev, queue_id)) {
		err = -EBUSY;
		goto out_rtnl_unlock;
	}

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		goto out_rtnl_unlock;

	umem->dev = dev;
	umem->queue_id = queue_id;
	if (force_copy)
		/* For copy-mode, we are done. */
		goto out_rtnl_unlock;

	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xsk_async_xmit) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;
	rtnl_unlock();

	dev_hold(dev);
	umem->zc = true;
	return 0;

err_unreg_umem:
	xdp_clear_umem_at_qid(dev, queue_id);
	if (!force_zc)
		err = 0; /* fallback to copy mode */
out_rtnl_unlock:
	rtnl_unlock();
	return err;
}

static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->zc) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");
	}

	if (umem->dev) {
		rtnl_lock();
		xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
		rtnl_unlock();
	}

	if (umem->zc) {
		dev_put(umem->dev);
		umem->zc = false;
	}
}

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xsk_reuseq_destroy(umem);

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_account;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}

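/* Allocate a umem and initialize it from the registration request @mr. */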
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}