Commit | Line | Data |
---|---|---|
546ac1ff JF |
1 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io |
2 | * | |
3 | * This program is free software; you can redistribute it and/or | |
4 | * modify it under the terms of version 2 of the GNU General Public | |
5 | * License as published by the Free Software Foundation. | |
6 | * | |
7 | * This program is distributed in the hope that it will be useful, but | |
8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | |
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
10 | * General Public License for more details. | |
11 | */ | |
12 | ||
13 | /* Devmaps primary use is as a backend map for XDP BPF helper call | |
14 | * bpf_redirect_map(). Because XDP is mostly concerned with performance we | |
15 | * spent some effort to ensure the datapath with redirect maps does not use | |
16 | * any locking. This is a quick note on the details. | |
17 | * | |
18 | * We have three possible paths to get into the devmap control plane bpf | |
19 | * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall | |
20 | * will invoke an update, delete, or lookup operation. To ensure updates and | |
21 | * deletes appear atomic from the datapath side xchg() is used to modify the | |
22 | * netdev_map array. Then because the datapath does a lookup into the netdev_map | |
23 | * array (read-only) from an RCU critical section we use call_rcu() to wait for | |
24 | * an rcu grace period before free'ing the old data structures. This ensures the | |
25 | * datapath always has a valid copy. However, the datapath does a "flush" | |
26 | * operation that pushes any pending packets in the driver outside the RCU | |
27 | * critical section. Each bpf_dtab_netdev tracks these pending operations using | |
28 | * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed | |
29 | * until all bits are cleared indicating outstanding flush operations have | |
30 | * completed. | |
31 | * | |
32 | * BPF syscalls may race with BPF program calls on any of the update, delete | |
33 | * or lookup operations. As noted above the xchg() operation also keep the | |
34 | * netdev_map consistent in this case. From the devmap side BPF programs | |
35 | * calling into these operations are the same as multiple user space threads | |
36 | * making system calls. | |
2ddf71e2 JF |
37 | * |
38 | * Finally, any of the above may race with a netdev_unregister notifier. The | |
39 | * unregister notifier must search for net devices in the map structure that | |
40 | * contain a reference to the net device and remove them. This is a two step | |
41 | * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) | |
42 | * check to see if the ifindex is the same as the net_device being removed. | |
4cc7b954 JF |
43 | * When removing the dev a cmpxchg() is used to ensure the correct dev is |
44 | * removed, in the case of a concurrent update or delete operation it is | |
45 | * possible that the initially referenced dev is no longer in the map. As the | |
46 | * notifier hook walks the map we know that new dev references can not be | |
47 | * added by the user because core infrastructure ensures dev_get_by_index() | |
48 | * calls will fail at this point. | |
546ac1ff JF |
49 | */ |
50 | #include <linux/bpf.h> | |
546ac1ff | 51 | #include <linux/filter.h> |
546ac1ff | 52 | |
6e71b04a CF |
53 | #define DEV_CREATE_FLAG_MASK \ |
54 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) | |
55 | ||
546ac1ff JF |
56 | struct bpf_dtab_netdev { |
57 | struct net_device *dev; | |
546ac1ff | 58 | struct bpf_dtab *dtab; |
af4d045c DB |
59 | unsigned int bit; |
60 | struct rcu_head rcu; | |
546ac1ff JF |
61 | }; |
62 | ||
63 | struct bpf_dtab { | |
64 | struct bpf_map map; | |
65 | struct bpf_dtab_netdev **netdev_map; | |
af4d045c | 66 | unsigned long __percpu *flush_needed; |
2ddf71e2 | 67 | struct list_head list; |
546ac1ff JF |
68 | }; |
69 | ||
4cc7b954 | 70 | static DEFINE_SPINLOCK(dev_map_lock); |
2ddf71e2 JF |
71 | static LIST_HEAD(dev_map_list); |
72 | ||
af4d045c DB |
73 | static u64 dev_map_bitmap_size(const union bpf_attr *attr) |
74 | { | |
8695a539 | 75 | return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); |
af4d045c DB |
76 | } |
77 | ||
546ac1ff JF |
78 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) |
79 | { | |
80 | struct bpf_dtab *dtab; | |
582db7e0 | 81 | int err = -EINVAL; |
546ac1ff | 82 | u64 cost; |
546ac1ff | 83 | |
9ef2a8cd JF |
84 | if (!capable(CAP_NET_ADMIN)) |
85 | return ERR_PTR(-EPERM); | |
86 | ||
546ac1ff JF |
87 | /* check sanity of attributes */ |
88 | if (attr->max_entries == 0 || attr->key_size != 4 || | |
6e71b04a | 89 | attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) |
546ac1ff JF |
90 | return ERR_PTR(-EINVAL); |
91 | ||
546ac1ff JF |
92 | dtab = kzalloc(sizeof(*dtab), GFP_USER); |
93 | if (!dtab) | |
94 | return ERR_PTR(-ENOMEM); | |
95 | ||
bd475643 | 96 | bpf_map_init_from_attr(&dtab->map, attr); |
546ac1ff | 97 | |
546ac1ff JF |
98 | /* make sure page count doesn't overflow */ |
99 | cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); | |
af4d045c | 100 | cost += dev_map_bitmap_size(attr) * num_possible_cpus(); |
546ac1ff JF |
101 | if (cost >= U32_MAX - PAGE_SIZE) |
102 | goto free_dtab; | |
103 | ||
104 | dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | |
105 | ||
106 | /* if map size is larger than memlock limit, reject it early */ | |
107 | err = bpf_map_precharge_memlock(dtab->map.pages); | |
108 | if (err) | |
109 | goto free_dtab; | |
110 | ||
582db7e0 TK |
111 | err = -ENOMEM; |
112 | ||
11393cc9 | 113 | /* A per cpu bitfield with a bit per possible net device */ |
82f8dd28 DB |
114 | dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), |
115 | __alignof__(unsigned long), | |
116 | GFP_KERNEL | __GFP_NOWARN); | |
11393cc9 JF |
117 | if (!dtab->flush_needed) |
118 | goto free_dtab; | |
119 | ||
546ac1ff | 120 | dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * |
96eabe7a MKL |
121 | sizeof(struct bpf_dtab_netdev *), |
122 | dtab->map.numa_node); | |
546ac1ff JF |
123 | if (!dtab->netdev_map) |
124 | goto free_dtab; | |
125 | ||
4cc7b954 JF |
126 | spin_lock(&dev_map_lock); |
127 | list_add_tail_rcu(&dtab->list, &dev_map_list); | |
128 | spin_unlock(&dev_map_lock); | |
546ac1ff | 129 | |
af4d045c | 130 | return &dtab->map; |
546ac1ff | 131 | free_dtab: |
11393cc9 | 132 | free_percpu(dtab->flush_needed); |
546ac1ff | 133 | kfree(dtab); |
582db7e0 | 134 | return ERR_PTR(err); |
546ac1ff JF |
135 | } |
136 | ||
137 | static void dev_map_free(struct bpf_map *map) | |
138 | { | |
139 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
11393cc9 | 140 | int i, cpu; |
546ac1ff JF |
141 | |
142 | /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, | |
143 | * so the programs (can be more than one that used this map) were | |
144 | * disconnected from events. Wait for outstanding critical sections in | |
145 | * these programs to complete. The rcu critical section only guarantees | |
146 | * no further reads against netdev_map. It does __not__ ensure pending | |
147 | * flush operations (if any) are complete. | |
148 | */ | |
274043c6 DB |
149 | |
150 | spin_lock(&dev_map_lock); | |
151 | list_del_rcu(&dtab->list); | |
152 | spin_unlock(&dev_map_lock); | |
153 | ||
546ac1ff JF |
154 | synchronize_rcu(); |
155 | ||
11393cc9 JF |
156 | /* To ensure all pending flush operations have completed wait for flush |
157 | * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. | |
158 | * Because the above synchronize_rcu() ensures the map is disconnected | |
159 | * from the program we can assume no new bits will be set. | |
160 | */ | |
161 | for_each_online_cpu(cpu) { | |
162 | unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); | |
163 | ||
164 | while (!bitmap_empty(bitmap, dtab->map.max_entries)) | |
374fb014 | 165 | cond_resched(); |
11393cc9 JF |
166 | } |
167 | ||
546ac1ff JF |
168 | for (i = 0; i < dtab->map.max_entries; i++) { |
169 | struct bpf_dtab_netdev *dev; | |
170 | ||
171 | dev = dtab->netdev_map[i]; | |
172 | if (!dev) | |
173 | continue; | |
174 | ||
175 | dev_put(dev->dev); | |
176 | kfree(dev); | |
177 | } | |
178 | ||
11393cc9 | 179 | free_percpu(dtab->flush_needed); |
546ac1ff JF |
180 | bpf_map_area_free(dtab->netdev_map); |
181 | kfree(dtab); | |
182 | } | |
183 | ||
184 | static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |
185 | { | |
186 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
187 | u32 index = key ? *(u32 *)key : U32_MAX; | |
af4d045c | 188 | u32 *next = next_key; |
546ac1ff JF |
189 | |
190 | if (index >= dtab->map.max_entries) { | |
191 | *next = 0; | |
192 | return 0; | |
193 | } | |
194 | ||
195 | if (index == dtab->map.max_entries - 1) | |
196 | return -ENOENT; | |
546ac1ff JF |
197 | *next = index + 1; |
198 | return 0; | |
199 | } | |
200 | ||
af4d045c | 201 | void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) |
11393cc9 JF |
202 | { |
203 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
204 | unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); | |
205 | ||
af4d045c | 206 | __set_bit(bit, bitmap); |
97f91a7c JF |
207 | } |
208 | ||
11393cc9 JF |
209 | /* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled |
210 | * from the driver before returning from its napi->poll() routine. The poll() | |
211 | * routine is called either from busy_poll context or net_rx_action signaled | |
212 | * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the | |
213 | * net device can be torn down. On devmap tear down we ensure the ctx bitmap | |
214 | * is zeroed before completing to ensure all flush operations have completed. | |
215 | */ | |
216 | void __dev_map_flush(struct bpf_map *map) | |
217 | { | |
218 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
219 | unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); | |
220 | u32 bit; | |
221 | ||
222 | for_each_set_bit(bit, bitmap, map->max_entries) { | |
223 | struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); | |
224 | struct net_device *netdev; | |
225 | ||
226 | /* This is possible if the dev entry is removed by user space | |
227 | * between xdp redirect and flush op. | |
228 | */ | |
229 | if (unlikely(!dev)) | |
230 | continue; | |
231 | ||
11393cc9 | 232 | __clear_bit(bit, bitmap); |
a5e2da6e DB |
233 | netdev = dev->dev; |
234 | if (likely(netdev->netdev_ops->ndo_xdp_flush)) | |
235 | netdev->netdev_ops->ndo_xdp_flush(netdev); | |
11393cc9 JF |
236 | } |
237 | } | |
238 | ||
546ac1ff JF |
239 | /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or |
240 | * update happens in parallel here a dev_put wont happen until after reading the | |
241 | * ifindex. | |
242 | */ | |
af4d045c | 243 | struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) |
546ac1ff JF |
244 | { |
245 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
246 | struct bpf_dtab_netdev *dev; | |
546ac1ff | 247 | |
af4d045c | 248 | if (key >= map->max_entries) |
546ac1ff JF |
249 | return NULL; |
250 | ||
af4d045c DB |
251 | dev = READ_ONCE(dtab->netdev_map[key]); |
252 | return dev ? dev->dev : NULL; | |
546ac1ff JF |
253 | } |
254 | ||
af4d045c DB |
255 | static void *dev_map_lookup_elem(struct bpf_map *map, void *key) |
256 | { | |
257 | struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key); | |
258 | ||
259 | return dev ? &dev->ifindex : NULL; | |
260 | } | |
261 | ||
262 | static void dev_map_flush_old(struct bpf_dtab_netdev *dev) | |
11393cc9 | 263 | { |
af4d045c DB |
264 | if (dev->dev->netdev_ops->ndo_xdp_flush) { |
265 | struct net_device *fl = dev->dev; | |
11393cc9 JF |
266 | unsigned long *bitmap; |
267 | int cpu; | |
268 | ||
269 | for_each_online_cpu(cpu) { | |
af4d045c DB |
270 | bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); |
271 | __clear_bit(dev->bit, bitmap); | |
11393cc9 | 272 | |
af4d045c | 273 | fl->netdev_ops->ndo_xdp_flush(dev->dev); |
11393cc9 JF |
274 | } |
275 | } | |
276 | } | |
277 | ||
546ac1ff JF |
278 | static void __dev_map_entry_free(struct rcu_head *rcu) |
279 | { | |
af4d045c | 280 | struct bpf_dtab_netdev *dev; |
546ac1ff | 281 | |
af4d045c DB |
282 | dev = container_of(rcu, struct bpf_dtab_netdev, rcu); |
283 | dev_map_flush_old(dev); | |
284 | dev_put(dev->dev); | |
285 | kfree(dev); | |
546ac1ff JF |
286 | } |
287 | ||
288 | static int dev_map_delete_elem(struct bpf_map *map, void *key) | |
289 | { | |
290 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
291 | struct bpf_dtab_netdev *old_dev; | |
292 | int k = *(u32 *)key; | |
293 | ||
294 | if (k >= map->max_entries) | |
295 | return -EINVAL; | |
296 | ||
af4d045c DB |
297 | /* Use call_rcu() here to ensure any rcu critical sections have |
298 | * completed, but this does not guarantee a flush has happened | |
546ac1ff JF |
299 | * yet. Because driver side rcu_read_lock/unlock only protects the |
300 | * running XDP program. However, for pending flush operations the | |
301 | * dev and ctx are stored in another per cpu map. And additionally, | |
302 | * the driver tear down ensures all soft irqs are complete before | |
303 | * removing the net device in the case of dev_put equals zero. | |
304 | */ | |
305 | old_dev = xchg(&dtab->netdev_map[k], NULL); | |
306 | if (old_dev) | |
307 | call_rcu(&old_dev->rcu, __dev_map_entry_free); | |
308 | return 0; | |
309 | } | |
310 | ||
311 | static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, | |
312 | u64 map_flags) | |
313 | { | |
314 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | |
315 | struct net *net = current->nsproxy->net_ns; | |
316 | struct bpf_dtab_netdev *dev, *old_dev; | |
317 | u32 i = *(u32 *)key; | |
318 | u32 ifindex = *(u32 *)value; | |
319 | ||
320 | if (unlikely(map_flags > BPF_EXIST)) | |
321 | return -EINVAL; | |
546ac1ff JF |
322 | if (unlikely(i >= dtab->map.max_entries)) |
323 | return -E2BIG; | |
546ac1ff JF |
324 | if (unlikely(map_flags == BPF_NOEXIST)) |
325 | return -EEXIST; | |
326 | ||
327 | if (!ifindex) { | |
328 | dev = NULL; | |
329 | } else { | |
96eabe7a MKL |
330 | dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, |
331 | map->numa_node); | |
546ac1ff JF |
332 | if (!dev) |
333 | return -ENOMEM; | |
334 | ||
335 | dev->dev = dev_get_by_index(net, ifindex); | |
336 | if (!dev->dev) { | |
337 | kfree(dev); | |
338 | return -EINVAL; | |
339 | } | |
340 | ||
af4d045c | 341 | dev->bit = i; |
546ac1ff JF |
342 | dev->dtab = dtab; |
343 | } | |
344 | ||
345 | /* Use call_rcu() here to ensure rcu critical sections have completed | |
346 | * Remembering the driver side flush operation will happen before the | |
347 | * net device is removed. | |
348 | */ | |
349 | old_dev = xchg(&dtab->netdev_map[i], dev); | |
350 | if (old_dev) | |
351 | call_rcu(&old_dev->rcu, __dev_map_entry_free); | |
352 | ||
353 | return 0; | |
354 | } | |
355 | ||
356 | const struct bpf_map_ops dev_map_ops = { | |
357 | .map_alloc = dev_map_alloc, | |
358 | .map_free = dev_map_free, | |
359 | .map_get_next_key = dev_map_get_next_key, | |
360 | .map_lookup_elem = dev_map_lookup_elem, | |
361 | .map_update_elem = dev_map_update_elem, | |
362 | .map_delete_elem = dev_map_delete_elem, | |
363 | }; | |
2ddf71e2 JF |
364 | |
365 | static int dev_map_notification(struct notifier_block *notifier, | |
366 | ulong event, void *ptr) | |
367 | { | |
368 | struct net_device *netdev = netdev_notifier_info_to_dev(ptr); | |
369 | struct bpf_dtab *dtab; | |
370 | int i; | |
371 | ||
372 | switch (event) { | |
373 | case NETDEV_UNREGISTER: | |
4cc7b954 JF |
374 | /* This rcu_read_lock/unlock pair is needed because |
375 | * dev_map_list is an RCU list AND to ensure a delete | |
376 | * operation does not free a netdev_map entry while we | |
377 | * are comparing it against the netdev being unregistered. | |
378 | */ | |
379 | rcu_read_lock(); | |
380 | list_for_each_entry_rcu(dtab, &dev_map_list, list) { | |
2ddf71e2 | 381 | for (i = 0; i < dtab->map.max_entries; i++) { |
4cc7b954 | 382 | struct bpf_dtab_netdev *dev, *odev; |
2ddf71e2 | 383 | |
4cc7b954 | 384 | dev = READ_ONCE(dtab->netdev_map[i]); |
2ddf71e2 JF |
385 | if (!dev || |
386 | dev->dev->ifindex != netdev->ifindex) | |
387 | continue; | |
4cc7b954 JF |
388 | odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); |
389 | if (dev == odev) | |
2ddf71e2 JF |
390 | call_rcu(&dev->rcu, |
391 | __dev_map_entry_free); | |
392 | } | |
393 | } | |
4cc7b954 | 394 | rcu_read_unlock(); |
2ddf71e2 JF |
395 | break; |
396 | default: | |
397 | break; | |
398 | } | |
399 | return NOTIFY_OK; | |
400 | } | |
401 | ||
402 | static struct notifier_block dev_map_notifier = { | |
403 | .notifier_call = dev_map_notification, | |
404 | }; | |
405 | ||
406 | static int __init dev_map_init(void) | |
407 | { | |
408 | register_netdevice_notifier(&dev_map_notifier); | |
409 | return 0; | |
410 | } | |
411 | ||
412 | subsys_initcall(dev_map_init); |