// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* A devmap's primary use is as a backend map for the XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side, xchg() is used to modify the
 * netdev_map array. Then, because the datapath does a lookup into the netdev_map
 * array (read-only) from an RCU critical section, we use call_rcu() to wait for
 * an rcu grace period before freeing the old data structures. This ensures the
 * datapath always has a valid copy. However, the datapath does a "flush"
 * operation that pushes any pending packets in the driver outside the RCU
 * critical section. Each bpf_dtab_netdev tracks these pending operations using
 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
 * this list is empty, indicating outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above, the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side, BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search the map structure for entries that contain
 * a reference to the net device and remove them. This is a two-step process:
 * (a) dereference the bpf_dtab_netdev object in netdev_map and (b) check to
 * see if the ifindex is the same as the net_device being removed.
 * When removing the dev, a cmpxchg() is used to ensure the correct dev is
 * removed; in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map, we know that new dev references cannot be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to be
 * densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion differ.
 */
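
/* For orientation, a minimal sketch of the BPF-side usage described above.
 * This is not part of this file; the map name "tx_port" and the key choice
 * are illustrative only:
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_DEVMAP);
 *		__uint(key_size, sizeof(__u32));
 *		__uint(value_size, sizeof(__u32));
 *		__uint(max_entries, 64);
 *	} tx_port SEC(".maps");
 *
 *	SEC("xdp")
 *	int xdp_redirect(struct xdp_md *ctx)
 *	{
 *		__u32 key = 0;	// slot chosen by the program
 *
 *		// Returns XDP_REDIRECT on success; the frame is then queued
 *		// to the target device via bq_enqueue() below.
 *		return bpf_redirect_map(&tx_port, key, 0);
 *	}
 */
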
47 | #include <linux/bpf.h> | |
67f29e07 | 48 | #include <net/xdp.h> |
546ac1ff | 49 | #include <linux/filter.h> |
67f29e07 | 50 | #include <trace/events/xdp.h> |
c317ab71 | 51 | #include <linux/btf_ids.h> |
546ac1ff | 52 | |
6e71b04a CF |
53 | #define DEV_CREATE_FLAG_MASK \ |
54 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) | |
55 | ||
75ccae62 | 56 | struct xdp_dev_bulk_queue { |
5d053f9d | 57 | struct xdp_frame *q[DEV_MAP_BULK_SIZE]; |
d5df2830 | 58 | struct list_head flush_node; |
75ccae62 | 59 | struct net_device *dev; |
38edddb8 | 60 | struct net_device *dev_rx; |
cb261b59 | 61 | struct bpf_prog *xdp_prog; |
5d053f9d JDB |
62 | unsigned int count; |
63 | }; | |
64 | ||
546ac1ff | 65 | struct bpf_dtab_netdev { |
67f29e07 | 66 | struct net_device *dev; /* must be first member, due to tracepoint */ |
6f9d451a | 67 | struct hlist_node index_hlist; |
546ac1ff | 68 | struct bpf_dtab *dtab; |
fbee97fe | 69 | struct bpf_prog *xdp_prog; |
af4d045c | 70 | struct rcu_head rcu; |
75ccae62 | 71 | unsigned int idx; |
7f1c0426 | 72 | struct bpf_devmap_val val; |
546ac1ff JF |
73 | }; |
74 | ||
75 | struct bpf_dtab { | |
76 | struct bpf_map map; | |
782347b6 | 77 | struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */ |
2ddf71e2 | 78 | struct list_head list; |
6f9d451a THJ |
79 | |
80 | /* these are only used for DEVMAP_HASH type maps */ | |
81 | struct hlist_head *dev_index_head; | |
82 | spinlock_t index_lock; | |
83 | unsigned int items; | |
84 | u32 n_buckets; | |
546ac1ff JF |
85 | }; |
86 | ||
1d233886 | 87 | static DEFINE_PER_CPU(struct list_head, dev_flush_list); |
4cc7b954 | 88 | static DEFINE_SPINLOCK(dev_map_lock); |
2ddf71e2 JF |
89 | static LIST_HEAD(dev_map_list); |
90 | ||
99c51064 THJ |
91 | static struct hlist_head *dev_map_create_hash(unsigned int entries, |
92 | int numa_node) | |
6f9d451a THJ |
93 | { |
94 | int i; | |
95 | struct hlist_head *hash; | |
96 | ||
7dd5d437 | 97 | hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node); |
6f9d451a THJ |
98 | if (hash != NULL) |
99 | for (i = 0; i < entries; i++) | |
100 | INIT_HLIST_HEAD(&hash[i]); | |
101 | ||
102 | return hash; | |
103 | } | |
104 | ||
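/* n_buckets is always a power of two (see dev_map_init_map()), so masking
 * with n_buckets - 1 is equivalent to idx % n_buckets.
 */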
static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
						    int idx)
{
	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
	u32 valsize = attr->value_size;

	/* check sanity of attributes. 2 value sizes supported:
	 * 4 bytes: ifindex
	 * 8 bytes: ifindex + prog fd
	 */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
		return -EINVAL;

	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
	 * verifier prevents writes from the BPF side
	 */
	attr->map_flags |= BPF_F_RDONLY_PROG;

	bpf_map_init_from_attr(&dtab->map, attr);

	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);

		if (!dtab->n_buckets) /* Overflow check */
			return -EINVAL;

		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
							   dtab->map.numa_node);
		if (!dtab->dev_index_head)
			return -ENOMEM;

		spin_lock_init(&dtab->index_lock);
	} else {
		dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
						      sizeof(struct bpf_dtab_netdev *),
						      dtab->map.numa_node);
		if (!dtab->netdev_map)
			return -ENOMEM;
	}

	return 0;
}

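/* A hedged sketch of the corresponding user-space update, assuming libbpf;
 * "map_fd", "ifindex" and "prog_fd" are illustrative. With the 8-byte value
 * layout checked above, a per-entry program fd may be supplied as well:
 *
 *	struct bpf_devmap_val val = {
 *		.ifindex = ifindex,
 *		.bpf_prog.fd = prog_fd,	// only valid with the 8-byte layout
 *	};
 *	__u32 key = 0;
 *
 *	bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
 */
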
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
	struct bpf_dtab *dtab;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
	if (!dtab)
		return ERR_PTR(-ENOMEM);

	err = dev_map_init_map(dtab, attr);
	if (err) {
		bpf_map_area_free(dtab);
		return ERR_PTR(err);
	}

	spin_lock(&dev_map_lock);
	list_add_tail_rcu(&dtab->list, &dev_map_list);
	spin_unlock(&dev_map_lock);

	return &dtab->map;
}

static void dev_map_free(struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	int i;

	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one that used this map) were
	 * disconnected from events. The following synchronize_rcu() guarantees
	 * that both rcu read critical sections complete and waits for
	 * preempt-disable regions (NAPI being the relevant context here) so we
	 * are certain there will be no further reads against the netdev_map and
	 * all flush operations are complete. Flush operations can only be done
	 * from NAPI context for this reason.
	 */

	spin_lock(&dev_map_lock);
	list_del_rcu(&dtab->list);
	spin_unlock(&dev_map_lock);

	bpf_clear_redirect_map(map);
	synchronize_rcu();

	/* Make sure prior __dev_map_entry_free() calls have completed. */
	rcu_barrier();

	if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
		for (i = 0; i < dtab->n_buckets; i++) {
			struct bpf_dtab_netdev *dev;
			struct hlist_head *head;
			struct hlist_node *next;

			head = dev_map_index_hash(dtab, i);

			hlist_for_each_entry_safe(dev, next, head, index_hlist) {
				hlist_del_rcu(&dev->index_hlist);
				if (dev->xdp_prog)
					bpf_prog_put(dev->xdp_prog);
				dev_put(dev->dev);
				kfree(dev);
			}
		}

		bpf_map_area_free(dtab->dev_index_head);
	} else {
		for (i = 0; i < dtab->map.max_entries; i++) {
			struct bpf_dtab_netdev *dev;

			dev = rcu_dereference_raw(dtab->netdev_map[i]);
			if (!dev)
				continue;

			if (dev->xdp_prog)
				bpf_prog_put(dev->xdp_prog);
			dev_put(dev->dev);
			kfree(dev);
		}

		bpf_map_area_free(dtab->netdev_map);
	}

	bpf_map_area_free(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 index = key ? *(u32 *)key : U32_MAX;
	u32 *next = next_key;

	if (index >= dtab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == dtab->map.max_entries - 1)
		return -ENOENT;
	*next = index + 1;
	return 0;
}

/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct hlist_head *head = dev_map_index_hash(dtab, key);
	struct bpf_dtab_netdev *dev;

	hlist_for_each_entry_rcu(dev, head, index_hlist,
				 lockdep_is_held(&dtab->index_lock))
		if (dev->idx == key)
			return dev;

	return NULL;
}

static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
				     void *next_key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u32 idx, *next = next_key;
	struct bpf_dtab_netdev *dev, *next_dev;
	struct hlist_head *head;
	int i = 0;

	if (!key)
		goto find_first;

	idx = *(u32 *)key;

	dev = __dev_map_hash_lookup_elem(map, idx);
	if (!dev)
		goto find_first;

	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
				    struct bpf_dtab_netdev, index_hlist);

	if (next_dev) {
		*next = next_dev->idx;
		return 0;
	}

	i = idx & (dtab->n_buckets - 1);
	i++;

find_first:
	for (; i < dtab->n_buckets; i++) {
		head = dev_map_index_hash(dtab, i);

		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					    struct bpf_dtab_netdev,
					    index_hlist);
		if (next_dev) {
			*next = next_dev->idx;
			return 0;
		}
	}

	return -ENOENT;
}

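/* Run the devmap-attached XDP program on each frame in 'frames' and compact
 * the frames that returned XDP_PASS to the front of the array; all other
 * verdicts free the frame. Returns the number of frames left to transmit.
 */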
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
				struct xdp_frame **frames, int n,
				struct net_device *dev)
{
	struct xdp_txq_info txq = { .dev = dev };
	struct xdp_buff xdp;
	int i, nframes = 0;

	for (i = 0; i < n; i++) {
		struct xdp_frame *xdpf = frames[i];
		u32 act;
		int err;

		xdp_convert_frame_to_buff(xdpf, &xdp);
		xdp.txq = &txq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);
		switch (act) {
		case XDP_PASS:
			err = xdp_update_frame_from_buff(&xdp, xdpf);
			if (unlikely(err < 0))
				xdp_return_frame_rx_napi(xdpf);
			else
				frames[nframes++] = xdpf;
			break;
		default:
			bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
			fallthrough;
		case XDP_ABORTED:
			trace_xdp_exception(dev, xdp_prog, act);
			fallthrough;
		case XDP_DROP:
			xdp_return_frame_rx_napi(xdpf);
			break;
		}
	}
	return nframes; /* sent frames count */
}

static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
	struct net_device *dev = bq->dev;
	unsigned int cnt = bq->count;
	int sent = 0, err = 0;
	int to_send = cnt;
	int i;

	if (unlikely(!cnt))
		return;

	for (i = 0; i < cnt; i++) {
		struct xdp_frame *xdpf = bq->q[i];

		prefetch(xdpf);
	}

	if (bq->xdp_prog) {
		to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
		if (!to_send)
			goto out;
	}

	sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
	if (sent < 0) {
		/* If ndo_xdp_xmit fails with an errno, no frames have
		 * been xmit'ed.
		 */
		err = sent;
		sent = 0;
	}

	/* If not all frames have been transmitted, it is our
	 * responsibility to free them
	 */
	for (i = sent; unlikely(i < to_send); i++)
		xdp_return_frame_rx_napi(bq->q[i]);

out:
	bq->count = 0;
	trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}

/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
 * driver before returning from its napi->poll() routine. See the comment above
 * xdp_do_flush() in filter.c.
 */
void __dev_flush(void)
{
	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
	struct xdp_dev_bulk_queue *bq, *tmp;

	list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
		bq_xmit_all(bq, XDP_XMIT_FLUSH);
		bq->dev_rx = NULL;
		bq->xdp_prog = NULL;
		__list_del_clearprev(&bq->flush_node);
	}
}

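/* For context, a hedged sketch of the driver-side contract described above;
 * "mydrv" names are illustrative, not a real driver. Frames redirected into
 * a devmap accumulate in per-CPU bulk queues until the driver winds up its
 * NAPI poll:
 *
 *	static int mydrv_napi_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *
 *		while (work < budget && mydrv_rx_pending(napi)) {
 *			// ... build xdp_buff, run the XDP program; on
 *			// XDP_REDIRECT the frame lands in bq_enqueue()
 *			work++;
 *		}
 *
 *		xdp_do_flush();	// drains dev_flush_list via __dev_flush()
 *
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */
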
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *obj;

	if (key >= map->max_entries)
		return NULL;

	obj = rcu_dereference_check(dtab->netdev_map[key],
				    rcu_read_lock_bh_held());
	return obj;
}

/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
 * variable access, and map elements stick around. See comment above
 * xdp_do_flush() in filter.c.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
	struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
	struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

	if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
		bq_xmit_all(bq, 0);

	/* Ingress dev_rx will be the same for all xdp_frames in the
	 * bulk_queue, because the bq is stored per-CPU and must be flushed
	 * at the end of the net_device driver's NAPI function.
	 *
	 * Do the same with xdp_prog and flush_list since these fields
	 * are only ever modified together.
	 */
	if (!bq->dev_rx) {
		bq->dev_rx = dev_rx;
		bq->xdp_prog = xdp_prog;
		list_add(&bq->flush_node, flush_list);
	}

	bq->q[bq->count++] = xdpf;
}

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
				struct net_device *dev_rx,
				struct bpf_prog *xdp_prog)
{
	int err;

	if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
		return -EOPNOTSUPP;

	if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
		     xdp_frame_has_frags(xdpf)))
		return -EOPNOTSUPP;

	err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
	if (unlikely(err))
		return err;

	bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
	return 0;
}

static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
	struct xdp_txq_info txq = { .dev = dst->dev };
	struct xdp_buff xdp;
	u32 act;

	if (!dst->xdp_prog)
		return XDP_PASS;

	__skb_pull(skb, skb->mac_len);
	xdp.txq = &txq;

	act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
	switch (act) {
	case XDP_PASS:
		__skb_push(skb, skb->mac_len);
		break;
	default:
		bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
		fallthrough;
	case XDP_ABORTED:
		trace_xdp_exception(dst->dev, dst->xdp_prog, act);
		fallthrough;
	case XDP_DROP:
		kfree_skb(skb);
		break;
	}

	return act;
}

int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}

int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
		    struct net_device *dev_rx)
{
	struct net_device *dev = dst->dev;

	return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}

static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
	if (!obj)
		return false;

	if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
		return false;

	if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
		     xdp_frame_has_frags(xdpf)))
		return false;

	if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
		return false;

	return true;
}

static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
				 struct net_device *dev_rx,
				 struct xdp_frame *xdpf)
{
	struct xdp_frame *nxdpf;

	nxdpf = xdpf_clone(xdpf);
	if (!nxdpf)
		return -ENOMEM;

	bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);

	return 0;
}

static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
{
	while (num_excluded--) {
		if (ifindex == excluded[num_excluded])
			return true;
	}
	return false;
}

/* Get ifindex of each upper device. 'indexes' must be able to hold at
 * least MAX_NEST_DEV elements.
 * Returns the number of ifindexes added.
 */
static int get_upper_ifindexes(struct net_device *dev, int *indexes)
{
	struct net_device *upper;
	struct list_head *iter;
	int n = 0;

	netdev_for_each_upper_dev_rcu(dev, upper, iter) {
		indexes[n++] = upper->ifindex;
	}
	return n;
}

int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
			  struct bpf_map *map, bool exclude_ingress)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dst, *last_dst = NULL;
	int excluded_devices[1 + MAX_NEST_DEV];
	struct hlist_head *head;
	int num_excluded = 0;
	unsigned int i;
	int err;

	if (exclude_ingress) {
		num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
		excluded_devices[num_excluded++] = dev_rx->ifindex;
	}

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		for (i = 0; i < map->max_entries; i++) {
			dst = rcu_dereference_check(dtab->netdev_map[i],
						    rcu_read_lock_bh_held());
			if (!is_valid_dst(dst, xdpf))
				continue;

			if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
				continue;

			/* we only need n-1 clones; last_dst enqueued below */
			if (!last_dst) {
				last_dst = dst;
				continue;
			}

			err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
			if (err)
				return err;

			last_dst = dst;
		}
	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
		for (i = 0; i < dtab->n_buckets; i++) {
			head = dev_map_index_hash(dtab, i);
			hlist_for_each_entry_rcu(dst, head, index_hlist,
						 lockdep_is_held(&dtab->index_lock)) {
				if (!is_valid_dst(dst, xdpf))
					continue;

				if (is_ifindex_excluded(excluded_devices, num_excluded,
							dst->dev->ifindex))
					continue;

				/* we only need n-1 clones; last_dst enqueued below */
				if (!last_dst) {
					last_dst = dst;
					continue;
				}

				err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
				if (err)
					return err;

				last_dst = dst;
			}
		}
	}

	/* consume the last copy of the frame */
	if (last_dst)
		bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
	else
		xdp_return_frame_rx_napi(xdpf); /* dtab is empty */

	return 0;
}

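/* Note on the BPF side: dev_map_enqueue_multi() backs the broadcast mode of
 * the redirect helper. A hedged sketch of the call that selects it (flags
 * only; the map name "tx_port" is illustrative):
 *
 *	// clone the frame to every device in the map except the
 *	// ingress interface and its upper devices
 *	bpf_redirect_map(&tx_port, 0, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
 */
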
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
			     struct bpf_prog *xdp_prog)
{
	int err;

	err = xdp_ok_fwd_dev(dst->dev, skb->len);
	if (unlikely(err))
		return err;

	/* Redirect has already succeeded semantically at this point, so we just
	 * return 0 even if packet is dropped. Helper below takes care of
	 * freeing skb.
	 */
	if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
		return 0;

	skb->dev = dst->dev;
	generic_xdp_tx(skb, xdp_prog);

	return 0;
}

static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
				  struct sk_buff *skb,
				  struct bpf_prog *xdp_prog)
{
	struct sk_buff *nskb;
	int err;

	nskb = skb_clone(skb, GFP_ATOMIC);
	if (!nskb)
		return -ENOMEM;

	err = dev_map_generic_redirect(dst, nskb, xdp_prog);
	if (unlikely(err)) {
		consume_skb(nskb);
		return err;
	}

	return 0;
}

int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
			   struct bpf_prog *xdp_prog, struct bpf_map *map,
			   bool exclude_ingress)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dst, *last_dst = NULL;
	int excluded_devices[1 + MAX_NEST_DEV];
	struct hlist_head *head;
	struct hlist_node *next;
	int num_excluded = 0;
	unsigned int i;
	int err;

	if (exclude_ingress) {
		num_excluded = get_upper_ifindexes(dev, excluded_devices);
		excluded_devices[num_excluded++] = dev->ifindex;
	}

	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
		for (i = 0; i < map->max_entries; i++) {
			dst = rcu_dereference_check(dtab->netdev_map[i],
						    rcu_read_lock_bh_held());
			if (!dst)
				continue;

			if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
				continue;

			/* we only need n-1 clones; last_dst enqueued below */
			if (!last_dst) {
				last_dst = dst;
				continue;
			}

			err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
			if (err)
				return err;

			last_dst = dst;
		}
	} else { /* BPF_MAP_TYPE_DEVMAP_HASH */
		for (i = 0; i < dtab->n_buckets; i++) {
			head = dev_map_index_hash(dtab, i);
			hlist_for_each_entry_safe(dst, next, head, index_hlist) {
				if (!dst)
					continue;

				if (is_ifindex_excluded(excluded_devices, num_excluded,
							dst->dev->ifindex))
					continue;

				/* we only need n-1 clones; last_dst enqueued below */
				if (!last_dst) {
					last_dst = dst;
					continue;
				}

				err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
				if (err)
					return err;

				last_dst = dst;
			}
		}
	}

	/* consume the first skb and return */
	if (last_dst)
		return dev_map_generic_redirect(last_dst, skb, xdp_prog);

	/* dtab is empty */
	consume_skb(skb);
	return 0;
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);

	return obj ? &obj->val : NULL;
}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
								 *(u32 *)key);
	return obj ? &obj->val : NULL;
}

static void __dev_map_entry_free(struct rcu_head *rcu)
{
	struct bpf_dtab_netdev *dev;

	dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
	if (dev->xdp_prog)
		bpf_prog_put(dev->xdp_prog);
	dev_put(dev->dev);
	kfree(dev);
}

static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;

	if (k >= map->max_entries)
		return -EINVAL;

	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
	if (old_dev) {
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
		atomic_dec((atomic_t *)&dtab->items);
	}
	return 0;
}

static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *old_dev;
	int k = *(u32 *)key;
	unsigned long flags;
	int ret = -ENOENT;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, k);
	if (old_dev) {
		dtab->items--;
		hlist_del_init_rcu(&old_dev->index_hlist);
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
		ret = 0;
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
						    struct bpf_dtab *dtab,
						    struct bpf_devmap_val *val,
						    unsigned int idx)
{
	struct bpf_prog *prog = NULL;
	struct bpf_dtab_netdev *dev;

	dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
				   GFP_NOWAIT | __GFP_NOWARN,
				   dtab->map.numa_node);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	dev->dev = dev_get_by_index(net, val->ifindex);
	if (!dev->dev)
		goto err_out;

	if (val->bpf_prog.fd > 0) {
		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
					     BPF_PROG_TYPE_XDP, false);
		if (IS_ERR(prog))
			goto err_put_dev;
		if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
		    !bpf_prog_map_compatible(&dtab->map, prog))
			goto err_put_prog;
	}

	dev->idx = idx;
	dev->dtab = dtab;
	if (prog) {
		dev->xdp_prog = prog;
		dev->val.bpf_prog.id = prog->aux->id;
	} else {
		dev->xdp_prog = NULL;
		dev->val.bpf_prog.id = 0;
	}
	dev->val.ifindex = val->ifindex;

	return dev;
err_put_prog:
	bpf_prog_put(prog);
err_put_dev:
	dev_put(dev->dev);
err_out:
	kfree(dev);
	return ERR_PTR(-EINVAL);
}

static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
				  void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	struct bpf_devmap_val val = {};
	u32 i = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		return -EINVAL;
	if (unlikely(i >= dtab->map.max_entries))
		return -E2BIG;
	if (unlikely(map_flags == BPF_NOEXIST))
		return -EEXIST;

	/* already verified value_size <= sizeof val */
	memcpy(&val, value, map->value_size);

	if (!val.ifindex) {
		dev = NULL;
		/* can not specify fd if ifindex is 0 */
		if (val.bpf_prog.fd > 0)
			return -EINVAL;
	} else {
		dev = __dev_map_alloc_node(net, dtab, &val, i);
		if (IS_ERR(dev))
			return PTR_ERR(dev);
	}

	/* Use call_rcu() here to ensure rcu critical sections have completed,
	 * remembering that the driver side flush operation will happen before
	 * the net device is removed.
	 */
	old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);
	else
		atomic_inc((atomic_t *)&dtab->items);

	return 0;
}

static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	return __dev_map_update_elem(current->nsproxy->net_ns,
				     map, key, value, map_flags);
}

static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
				       void *key, void *value, u64 map_flags)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	struct bpf_dtab_netdev *dev, *old_dev;
	struct bpf_devmap_val val = {};
	u32 idx = *(u32 *)key;
	unsigned long flags;
	int err = -EEXIST;

	/* already verified value_size <= sizeof val */
	memcpy(&val, value, map->value_size);

	if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
		return -EINVAL;

	spin_lock_irqsave(&dtab->index_lock, flags);

	old_dev = __dev_map_hash_lookup_elem(map, idx);
	if (old_dev && (map_flags & BPF_NOEXIST))
		goto out_err;

	dev = __dev_map_alloc_node(net, dtab, &val, idx);
	if (IS_ERR(dev)) {
		err = PTR_ERR(dev);
		goto out_err;
	}

	if (old_dev) {
		hlist_del_rcu(&old_dev->index_hlist);
	} else {
		if (dtab->items >= dtab->map.max_entries) {
			spin_unlock_irqrestore(&dtab->index_lock, flags);
			call_rcu(&dev->rcu, __dev_map_entry_free);
			return -E2BIG;
		}
		dtab->items++;
	}

	hlist_add_head_rcu(&dev->index_hlist,
			   dev_map_index_hash(dtab, idx));
	spin_unlock_irqrestore(&dtab->index_lock, flags);

	if (old_dev)
		call_rcu(&old_dev->rcu, __dev_map_entry_free);

	return 0;

out_err:
	spin_unlock_irqrestore(&dtab->index_lock, flags);
	return err;
}

static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
				     u64 map_flags)
{
	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
					  map, key, value, map_flags);
}

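/* These back the bpf_redirect_map() helper for the two devmap types; the
 * generic __bpf_xdp_redirect_map() only differs in the lookup callback and
 * in which flag bits (broadcast/exclude-ingress) are accepted.
 */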
static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags,
				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
				      __dev_map_lookup_elem);
}

static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
	return __bpf_xdp_redirect_map(map, ifindex, flags,
				      BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
				      __dev_map_hash_lookup_elem);
}

static u64 dev_map_mem_usage(const struct bpf_map *map)
{
	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
	u64 usage = sizeof(struct bpf_dtab);

	if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
		usage += (u64)dtab->n_buckets * sizeof(struct hlist_head);
	else
		usage += (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);
	usage += atomic_read((atomic_t *)&dtab->items) *
			 (u64)sizeof(struct bpf_dtab_netdev);
	return usage;
}

BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_get_next_key,
	.map_lookup_elem = dev_map_lookup_elem,
	.map_update_elem = dev_map_update_elem,
	.map_delete_elem = dev_map_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = dev_map_mem_usage,
	.map_btf_id = &dev_map_btf_ids[0],
	.map_redirect = dev_map_redirect,
};

const struct bpf_map_ops dev_map_hash_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = dev_map_alloc,
	.map_free = dev_map_free,
	.map_get_next_key = dev_map_hash_get_next_key,
	.map_lookup_elem = dev_map_hash_lookup_elem,
	.map_update_elem = dev_map_hash_update_elem,
	.map_delete_elem = dev_map_hash_delete_elem,
	.map_check_btf = map_check_no_btf,
	.map_mem_usage = dev_map_mem_usage,
	.map_btf_id = &dev_map_btf_ids[0],
	.map_redirect = dev_hash_map_redirect,
};

static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
				       struct net_device *netdev)
{
	unsigned long flags;
	u32 i;

	spin_lock_irqsave(&dtab->index_lock, flags);
	for (i = 0; i < dtab->n_buckets; i++) {
		struct bpf_dtab_netdev *dev;
		struct hlist_head *head;
		struct hlist_node *next;

		head = dev_map_index_hash(dtab, i);

		hlist_for_each_entry_safe(dev, next, head, index_hlist) {
			if (netdev != dev->dev)
				continue;

			dtab->items--;
			hlist_del_rcu(&dev->index_hlist);
			call_rcu(&dev->rcu, __dev_map_entry_free);
		}
	}
	spin_unlock_irqrestore(&dtab->index_lock, flags);
}

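/* The notifier has two jobs: on NETDEV_REGISTER, allocate the per-CPU bulk
 * queues used by bq_enqueue() for any device that can ndo_xdp_xmit(); on
 * NETDEV_UNREGISTER, purge every devmap entry that still references the
 * departing device, as described in the comment at the top of this file.
 */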
static int dev_map_notification(struct notifier_block *notifier,
				ulong event, void *ptr)
{
	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
	struct bpf_dtab *dtab;
	int i, cpu;

	switch (event) {
	case NETDEV_REGISTER:
		if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
			break;

		/* will be freed in free_netdev() */
		netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
		if (!netdev->xdp_bulkq)
			return NOTIFY_BAD;

		for_each_possible_cpu(cpu)
			per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
		break;
	case NETDEV_UNREGISTER:
		/* This rcu_read_lock/unlock pair is needed because
		 * dev_map_list is an RCU list AND to ensure a delete
		 * operation does not free a netdev_map entry while we
		 * are comparing it against the netdev being unregistered.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(dtab, &dev_map_list, list) {
			if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
				dev_map_hash_remove_netdev(dtab, netdev);
				continue;
			}

			for (i = 0; i < dtab->map.max_entries; i++) {
				struct bpf_dtab_netdev *dev, *odev;

				dev = rcu_dereference(dtab->netdev_map[i]);
				if (!dev || netdev != dev->dev)
					continue;
				odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
				if (dev == odev) {
					call_rcu(&dev->rcu,
						 __dev_map_entry_free);
					atomic_dec((atomic_t *)&dtab->items);
				}
			}
		}
		rcu_read_unlock();
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block dev_map_notifier = {
	.notifier_call = dev_map_notification,
};

static int __init dev_map_init(void)
{
	int cpu;

	/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
	BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
		     offsetof(struct _bpf_dtab_netdev, dev));
	register_netdevice_notifier(&dev_map_notifier);

	for_each_possible_cpu(cpu)
		INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
	return 0;
}

subsys_initcall(dev_map_init);