net/xdp/xdp_umem.c
// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

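/* Add an XDP socket to the umem's list of bound sockets. The list is
 * walked under RCU, so insertion uses list_add_rcu() under the
 * xsk_list_lock.
 */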
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

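/* Remove an XDP socket from the umem's socket list. For a zero-copy
 * umem, wait for an RCU grace period so concurrent readers of the list
 * are done with the socket before the caller continues tearing it down.
 */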
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (xs->dev) {
		spin_lock_irqsave(&umem->xsk_list_lock, flags);
		list_del_rcu(&xs->list);
		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);

		if (umem->zc)
			synchronize_net();
	}
}

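/* Ask the driver whether a umem is already attached to @queue_id.
 * Returns 0 for "no" (or when the driver lacks ndo_bpf), a positive
 * value for "yes", or a negative errno from the driver. Must be called
 * with the rtnl lock held.
 */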
int xdp_umem_query(struct net_device *dev, u16 queue_id)
{
	struct netdev_bpf bpf;

	ASSERT_RTNL();

	memset(&bpf, 0, sizeof(bpf));
	bpf.command = XDP_QUERY_XSK_UMEM;
	bpf.xsk.queue_id = queue_id;

	if (!dev->netdev_ops->ndo_bpf)
		return 0;
	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
}

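/* Try to set up zero-copy on @queue_id of @dev. XDP_COPY skips the
 * driver setup entirely; XDP_ZEROCOPY turns any failure into a hard
 * error; with neither flag set, failures fall back to copy mode.
 */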
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u32 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (force_copy)
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
		return force_zc ? -EOPNOTSUPP : 0; /* fail or fallback */

	rtnl_lock();
	err = xdp_umem_query(dev, queue_id);
	if (err) {
		err = err < 0 ? -EOPNOTSUPP : -EBUSY;
		goto err_rtnl_unlock;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_rtnl_unlock;
	rtnl_unlock();

	dev_hold(dev);
	umem->dev = dev;
	umem->queue_id = queue_id;
	umem->zc = true;
	return 0;

err_rtnl_unlock:
	rtnl_unlock();
	return force_zc ? err : 0; /* fail or fallback */
}

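/* Detach the umem from its device, if any: ask the driver to tear down
 * the zero-copy state for the queue and drop the device reference taken
 * in xdp_umem_assign_dev().
 */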
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->dev) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");

		dev_put(umem->dev);
		umem->dev = NULL;
	}
}

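/* Release the pages pinned by xdp_umem_pin_pages(): mark each page
 * dirty, drop its reference and free the page array.
 */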
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

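/* Give the pinned pages back to the user's RLIMIT_MEMLOCK accounting. */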
static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

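/* Final teardown of a umem: detach it from the device, destroy the fill
 * and completion queues, unpin and unaccount the pages, and free the
 * umem itself.
 */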
static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}

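/* Work item callback that performs the actual release from the workqueue. */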
static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

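/* Take a reference on the umem. */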
void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

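/* Drop a reference on the umem; the final put defers the release to a
 * workqueue.
 */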
void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

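/* Pin the pages backing the user memory area with get_user_pages().
 * On a partial pin, the pages that were pinned are released again and
 * -ENOMEM is returned.
 */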
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

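/* Charge the pinned pages against the user's RLIMIT_MEMLOCK limit,
 * unless the caller has CAP_IPC_LOCK. The cmpxchg loop retries if
 * locked_vm was updated concurrently.
 */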
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

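/* Validate an XDP_UMEM_REG request and set up the umem: check chunk
 * size, alignment and headroom, account and pin the user pages, and
 * build the kernel-side page address array.
 */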
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. This is
		 * required for simplicity and might be relaxed in the
		 * future.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->chunk_mask = ~((u64)chunk_size - 1);
	umem->size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;
	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_account;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}

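/* Allocate a umem and register the user memory described by @mr. */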
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

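/* A umem is usable once both its fill and completion rings exist. */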
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}