Commit | Line | Data |
---|---|---|
20d29d7a AB |
1 | #include <linux/etherdevice.h> |
2 | #include <linux/if_macvlan.h> | |
3 | #include <linux/interrupt.h> | |
4 | #include <linux/nsproxy.h> | |
5 | #include <linux/compat.h> | |
6 | #include <linux/if_tun.h> | |
7 | #include <linux/module.h> | |
8 | #include <linux/skbuff.h> | |
9 | #include <linux/cache.h> | |
10 | #include <linux/sched.h> | |
11 | #include <linux/types.h> | |
12 | #include <linux/init.h> | |
13 | #include <linux/wait.h> | |
14 | #include <linux/cdev.h> | |
15 | #include <linux/fs.h> | |
16 | ||
17 | #include <net/net_namespace.h> | |
18 | #include <net/rtnetlink.h> | |
19 | #include <net/sock.h> | |
20 | ||
21 | /* | |
22 | * A macvtap queue is the central object of this driver, it connects | |
23 | * an open character device to a macvlan interface. There can be | |
24 | * multiple queues on one interface, which map back to queues | |
25 | * implemented in hardware on the underlying device. | |
26 | * | |
27 | * macvtap_proto is used to allocate queues through the sock allocation | |
28 | * mechanism. | |
29 | * | |
30 | * TODO: multiqueue support is currently not implemented, even though | |
31 | * macvtap is basically prepared for that. We will need to add this | |
32 | * here as well as in virtio-net and qemu to get line rate on 10gbit | |
33 | * adapters from a guest. | |
34 | */ | |
35 | struct macvtap_queue { | |
36 | struct sock sk; | |
37 | struct socket sock; | |
38 | struct macvlan_dev *vlan; | |
39 | struct file *file; | |
40 | }; | |
41 | ||
42 | static struct proto macvtap_proto = { | |
43 | .name = "macvtap", | |
44 | .owner = THIS_MODULE, | |
45 | .obj_size = sizeof (struct macvtap_queue), | |
46 | }; | |
47 | ||
48 | /* | |
49 | * Minor number matches netdev->ifindex, so need a potentially | |
50 | * large value. This also makes it possible to split the | |
51 | * tap functionality out again in the future by offering it | |
52 | * from other drivers besides macvtap. As long as every device | |
53 | * only has one tap, the interface numbers assure that the | |
54 | * device nodes are unique. | |
55 | */ | |
56 | static unsigned int macvtap_major; | |
57 | #define MACVTAP_NUM_DEVS 65536 | |
58 | static struct class *macvtap_class; | |
59 | static struct cdev macvtap_cdev; | |
60 | ||
61 | /* | |
62 | * RCU usage: | |
63 | * The macvtap_queue is referenced both from the chardev struct file | |
64 | * and from the struct macvlan_dev using rcu_read_lock. | |
65 | * | |
66 | * We never actually update the contents of a macvtap_queue atomically | |
67 | * with RCU but it is used for race-free destruction of a queue when | |
68 | * either the file or the macvlan_dev goes away. Pointers back to | |
69 | * the dev and the file are implicitly valid as long as the queue | |
70 | * exists. | |
71 | * | |
72 | * The callbacks from macvlan are always done with rcu_read_lock held | |
564517e8 AB |
73 | * already. For calls from file_operations, we use the rcu_read_lock_bh |
74 | * to get a reference count on the socket and the device. | |
20d29d7a AB |
75 | * |
76 | * When destroying a queue, we remove the pointers from the file and | |
77 | * from the dev and then synchronize_rcu to make sure no thread is | |
78 | * still using the queue. There may still be references to the struct | |
79 | * sock inside of the queue from outbound SKBs, but these never | |
80 | * reference back to the file or the dev. The data structure is freed | |
81 | * through __sk_free when both our references and any pending SKBs | |
82 | * are gone. | |
83 | * | |
84 | * macvtap_lock is only used to prevent multiple concurrent open() | |
85 | * calls to assign a new vlan->tap pointer. It could be moved into | |
86 | * the macvlan_dev itself but is extremely rarely used. | |
87 | */ | |
/* Serializes assignment and removal of the vlan->tap / file queue pointers. */
static DEFINE_SPINLOCK(macvtap_lock);
89 | ||
90 | /* | |
91 | * Choose the next free queue, for now there is only one | |
92 | */ | |
93 | static int macvtap_set_queue(struct net_device *dev, struct file *file, | |
94 | struct macvtap_queue *q) | |
95 | { | |
96 | struct macvlan_dev *vlan = netdev_priv(dev); | |
97 | int err = -EBUSY; | |
98 | ||
99 | spin_lock(&macvtap_lock); | |
100 | if (rcu_dereference(vlan->tap)) | |
101 | goto out; | |
102 | ||
103 | err = 0; | |
104 | q->vlan = vlan; | |
105 | rcu_assign_pointer(vlan->tap, q); | |
106 | ||
107 | q->file = file; | |
108 | rcu_assign_pointer(file->private_data, q); | |
109 | ||
110 | out: | |
111 | spin_unlock(&macvtap_lock); | |
112 | return err; | |
113 | } | |
114 | ||
115 | /* | |
116 | * We must destroy each queue exactly once, when either | |
117 | * the netdev or the file go away. | |
118 | * | |
119 | * Using the spinlock makes sure that we don't get | |
120 | * to the queue again after destroying it. | |
121 | * | |
122 | * synchronize_rcu serializes with the packet flow | |
123 | * that uses rcu_read_lock. | |
124 | */ | |
125 | static void macvtap_del_queue(struct macvtap_queue **qp) | |
126 | { | |
127 | struct macvtap_queue *q; | |
128 | ||
129 | spin_lock(&macvtap_lock); | |
130 | q = rcu_dereference(*qp); | |
131 | if (!q) { | |
132 | spin_unlock(&macvtap_lock); | |
133 | return; | |
134 | } | |
135 | ||
136 | rcu_assign_pointer(q->vlan->tap, NULL); | |
137 | rcu_assign_pointer(q->file->private_data, NULL); | |
138 | spin_unlock(&macvtap_lock); | |
139 | ||
140 | synchronize_rcu(); | |
141 | sock_put(&q->sk); | |
142 | } | |
143 | ||
144 | /* | |
145 | * Since we only support one queue, just dereference the pointer. | |
146 | */ | |
147 | static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, | |
148 | struct sk_buff *skb) | |
149 | { | |
150 | struct macvlan_dev *vlan = netdev_priv(dev); | |
151 | ||
152 | return rcu_dereference(vlan->tap); | |
153 | } | |
154 | ||
155 | static void macvtap_del_queues(struct net_device *dev) | |
156 | { | |
157 | struct macvlan_dev *vlan = netdev_priv(dev); | |
158 | macvtap_del_queue(&vlan->tap); | |
159 | } | |
160 | ||
161 | static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) | |
162 | { | |
564517e8 | 163 | struct macvtap_queue *q; |
20d29d7a | 164 | rcu_read_lock_bh(); |
564517e8 AB |
165 | q = rcu_dereference(file->private_data); |
166 | if (q) { | |
167 | sock_hold(&q->sk); | |
168 | dev_hold(q->vlan->dev); | |
169 | } | |
170 | rcu_read_unlock_bh(); | |
171 | return q; | |
20d29d7a AB |
172 | } |
173 | ||
564517e8 | 174 | static inline void macvtap_file_put_queue(struct macvtap_queue *q) |
20d29d7a | 175 | { |
564517e8 AB |
176 | sock_put(&q->sk); |
177 | dev_put(q->vlan->dev); | |
20d29d7a AB |
178 | } |
179 | ||
180 | /* | |
181 | * Forward happens for data that gets sent from one macvlan | |
182 | * endpoint to another one in bridge mode. We just take | |
183 | * the skb and put it into the receive queue. | |
184 | */ | |
185 | static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) | |
186 | { | |
187 | struct macvtap_queue *q = macvtap_get_queue(dev, skb); | |
188 | if (!q) | |
189 | return -ENOLINK; | |
190 | ||
191 | skb_queue_tail(&q->sk.sk_receive_queue, skb); | |
192 | wake_up(q->sk.sk_sleep); | |
193 | return 0; | |
194 | } | |
195 | ||
196 | /* | |
197 | * Receive is for data from the external interface (lowerdev), | |
198 | * in case of macvtap, we can treat that the same way as | |
199 | * forward, which macvlan cannot. | |
200 | */ | |
201 | static int macvtap_receive(struct sk_buff *skb) | |
202 | { | |
203 | skb_push(skb, ETH_HLEN); | |
204 | return macvtap_forward(skb->dev, skb); | |
205 | } | |
206 | ||
207 | static int macvtap_newlink(struct net *src_net, | |
208 | struct net_device *dev, | |
209 | struct nlattr *tb[], | |
210 | struct nlattr *data[]) | |
211 | { | |
212 | struct device *classdev; | |
213 | dev_t devt; | |
214 | int err; | |
215 | ||
216 | err = macvlan_common_newlink(src_net, dev, tb, data, | |
217 | macvtap_receive, macvtap_forward); | |
218 | if (err) | |
219 | goto out; | |
220 | ||
221 | devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); | |
222 | ||
223 | classdev = device_create(macvtap_class, &dev->dev, devt, | |
224 | dev, "tap%d", dev->ifindex); | |
225 | if (IS_ERR(classdev)) { | |
226 | err = PTR_ERR(classdev); | |
227 | macvtap_del_queues(dev); | |
228 | } | |
229 | ||
230 | out: | |
231 | return err; | |
232 | } | |
233 | ||
234 | static void macvtap_dellink(struct net_device *dev, | |
235 | struct list_head *head) | |
236 | { | |
237 | device_destroy(macvtap_class, | |
238 | MKDEV(MAJOR(macvtap_major), dev->ifindex)); | |
239 | ||
240 | macvtap_del_queues(dev); | |
241 | macvlan_dellink(dev, head); | |
242 | } | |
243 | ||
244 | static struct rtnl_link_ops macvtap_link_ops __read_mostly = { | |
245 | .kind = "macvtap", | |
246 | .newlink = macvtap_newlink, | |
247 | .dellink = macvtap_dellink, | |
248 | }; | |
249 | ||
250 | ||
251 | static void macvtap_sock_write_space(struct sock *sk) | |
252 | { | |
253 | if (!sock_writeable(sk) || | |
254 | !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) | |
255 | return; | |
256 | ||
257 | if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) | |
258 | wake_up_interruptible_sync(sk->sk_sleep); | |
259 | } | |
260 | ||
261 | static int macvtap_open(struct inode *inode, struct file *file) | |
262 | { | |
263 | struct net *net = current->nsproxy->net_ns; | |
264 | struct net_device *dev = dev_get_by_index(net, iminor(inode)); | |
265 | struct macvtap_queue *q; | |
266 | int err; | |
267 | ||
268 | err = -ENODEV; | |
269 | if (!dev) | |
270 | goto out; | |
271 | ||
272 | /* check if this is a macvtap device */ | |
273 | err = -EINVAL; | |
274 | if (dev->rtnl_link_ops != &macvtap_link_ops) | |
275 | goto out; | |
276 | ||
277 | err = -ENOMEM; | |
278 | q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, | |
279 | &macvtap_proto); | |
280 | if (!q) | |
281 | goto out; | |
282 | ||
283 | init_waitqueue_head(&q->sock.wait); | |
284 | q->sock.type = SOCK_RAW; | |
285 | q->sock.state = SS_CONNECTED; | |
286 | sock_init_data(&q->sock, &q->sk); | |
287 | q->sk.sk_allocation = GFP_ATOMIC; /* for now */ | |
288 | q->sk.sk_write_space = macvtap_sock_write_space; | |
289 | ||
290 | err = macvtap_set_queue(dev, file, q); | |
291 | if (err) | |
292 | sock_put(&q->sk); | |
293 | ||
294 | out: | |
295 | if (dev) | |
296 | dev_put(dev); | |
297 | ||
298 | return err; | |
299 | } | |
300 | ||
301 | static int macvtap_release(struct inode *inode, struct file *file) | |
302 | { | |
303 | macvtap_del_queue((struct macvtap_queue **)&file->private_data); | |
304 | return 0; | |
305 | } | |
306 | ||
307 | static unsigned int macvtap_poll(struct file *file, poll_table * wait) | |
308 | { | |
309 | struct macvtap_queue *q = macvtap_file_get_queue(file); | |
310 | unsigned int mask = POLLERR; | |
311 | ||
312 | if (!q) | |
313 | goto out; | |
314 | ||
315 | mask = 0; | |
316 | poll_wait(file, &q->sock.wait, wait); | |
317 | ||
318 | if (!skb_queue_empty(&q->sk.sk_receive_queue)) | |
319 | mask |= POLLIN | POLLRDNORM; | |
320 | ||
321 | if (sock_writeable(&q->sk) || | |
322 | (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && | |
323 | sock_writeable(&q->sk))) | |
324 | mask |= POLLOUT | POLLWRNORM; | |
325 | ||
564517e8 | 326 | macvtap_file_put_queue(q); |
20d29d7a | 327 | out: |
20d29d7a AB |
328 | return mask; |
329 | } | |
330 | ||
331 | /* Get packet from user space buffer */ | |
332 | static ssize_t macvtap_get_user(struct macvtap_queue *q, | |
333 | const struct iovec *iv, size_t count, | |
334 | int noblock) | |
335 | { | |
336 | struct sk_buff *skb; | |
337 | size_t len = count; | |
338 | int err; | |
339 | ||
340 | if (unlikely(len < ETH_HLEN)) | |
341 | return -EINVAL; | |
342 | ||
343 | skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); | |
344 | ||
345 | if (!skb) { | |
346 | macvlan_count_rx(q->vlan, 0, false, false); | |
347 | return err; | |
348 | } | |
349 | ||
350 | skb_reserve(skb, NET_IP_ALIGN); | |
351 | skb_put(skb, count); | |
352 | ||
353 | if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { | |
354 | macvlan_count_rx(q->vlan, 0, false, false); | |
355 | kfree_skb(skb); | |
356 | return -EFAULT; | |
357 | } | |
358 | ||
359 | skb_set_network_header(skb, ETH_HLEN); | |
360 | ||
361 | macvlan_start_xmit(skb, q->vlan->dev); | |
362 | ||
363 | return count; | |
364 | } | |
365 | ||
366 | static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, | |
367 | unsigned long count, loff_t pos) | |
368 | { | |
369 | struct file *file = iocb->ki_filp; | |
370 | ssize_t result = -ENOLINK; | |
371 | struct macvtap_queue *q = macvtap_file_get_queue(file); | |
372 | ||
373 | if (!q) | |
374 | goto out; | |
375 | ||
376 | result = macvtap_get_user(q, iv, iov_length(iv, count), | |
377 | file->f_flags & O_NONBLOCK); | |
564517e8 | 378 | macvtap_file_put_queue(q); |
20d29d7a | 379 | out: |
20d29d7a AB |
380 | return result; |
381 | } | |
382 | ||
383 | /* Put packet to the user space buffer */ | |
384 | static ssize_t macvtap_put_user(struct macvtap_queue *q, | |
385 | const struct sk_buff *skb, | |
386 | const struct iovec *iv, int len) | |
387 | { | |
388 | struct macvlan_dev *vlan = q->vlan; | |
389 | int ret; | |
390 | ||
391 | len = min_t(int, skb->len, len); | |
392 | ||
393 | ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); | |
394 | ||
395 | macvlan_count_rx(vlan, len, ret == 0, 0); | |
396 | ||
397 | return ret ? ret : len; | |
398 | } | |
399 | ||
400 | static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, | |
401 | unsigned long count, loff_t pos) | |
402 | { | |
403 | struct file *file = iocb->ki_filp; | |
404 | struct macvtap_queue *q = macvtap_file_get_queue(file); | |
405 | ||
406 | DECLARE_WAITQUEUE(wait, current); | |
407 | struct sk_buff *skb; | |
408 | ssize_t len, ret = 0; | |
409 | ||
564517e8 AB |
410 | if (!q) |
411 | return -ENOLINK; | |
20d29d7a AB |
412 | |
413 | len = iov_length(iv, count); | |
414 | if (len < 0) { | |
415 | ret = -EINVAL; | |
416 | goto out; | |
417 | } | |
418 | ||
419 | add_wait_queue(q->sk.sk_sleep, &wait); | |
420 | while (len) { | |
421 | current->state = TASK_INTERRUPTIBLE; | |
422 | ||
423 | /* Read frames from the queue */ | |
424 | skb = skb_dequeue(&q->sk.sk_receive_queue); | |
425 | if (!skb) { | |
426 | if (file->f_flags & O_NONBLOCK) { | |
427 | ret = -EAGAIN; | |
428 | break; | |
429 | } | |
430 | if (signal_pending(current)) { | |
431 | ret = -ERESTARTSYS; | |
432 | break; | |
433 | } | |
434 | /* Nothing to read, let's sleep */ | |
435 | schedule(); | |
436 | continue; | |
437 | } | |
438 | ret = macvtap_put_user(q, skb, iv, len); | |
439 | kfree_skb(skb); | |
440 | break; | |
441 | } | |
442 | ||
443 | current->state = TASK_RUNNING; | |
444 | remove_wait_queue(q->sk.sk_sleep, &wait); | |
445 | ||
446 | out: | |
564517e8 | 447 | macvtap_file_put_queue(q); |
20d29d7a AB |
448 | return ret; |
449 | } | |
450 | ||
451 | /* | |
452 | * provide compatibility with generic tun/tap interface | |
453 | */ | |
454 | static long macvtap_ioctl(struct file *file, unsigned int cmd, | |
455 | unsigned long arg) | |
456 | { | |
457 | struct macvtap_queue *q; | |
458 | void __user *argp = (void __user *)arg; | |
459 | struct ifreq __user *ifr = argp; | |
460 | unsigned int __user *up = argp; | |
461 | unsigned int u; | |
462 | char devname[IFNAMSIZ]; | |
463 | ||
464 | switch (cmd) { | |
465 | case TUNSETIFF: | |
466 | /* ignore the name, just look at flags */ | |
467 | if (get_user(u, &ifr->ifr_flags)) | |
468 | return -EFAULT; | |
469 | if (u != (IFF_TAP | IFF_NO_PI)) | |
470 | return -EINVAL; | |
471 | return 0; | |
472 | ||
473 | case TUNGETIFF: | |
474 | q = macvtap_file_get_queue(file); | |
475 | if (!q) | |
476 | return -ENOLINK; | |
477 | memcpy(devname, q->vlan->dev->name, sizeof(devname)); | |
564517e8 | 478 | macvtap_file_put_queue(q); |
20d29d7a AB |
479 | |
480 | if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || | |
481 | put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) | |
482 | return -EFAULT; | |
483 | return 0; | |
484 | ||
485 | case TUNGETFEATURES: | |
486 | if (put_user((IFF_TAP | IFF_NO_PI), up)) | |
487 | return -EFAULT; | |
488 | return 0; | |
489 | ||
490 | case TUNSETSNDBUF: | |
491 | if (get_user(u, up)) | |
492 | return -EFAULT; | |
493 | ||
494 | q = macvtap_file_get_queue(file); | |
564517e8 AB |
495 | if (!q) |
496 | return -ENOLINK; | |
20d29d7a | 497 | q->sk.sk_sndbuf = u; |
564517e8 | 498 | macvtap_file_put_queue(q); |
20d29d7a AB |
499 | return 0; |
500 | ||
501 | case TUNSETOFFLOAD: | |
502 | /* let the user check for future flags */ | |
503 | if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | | |
504 | TUN_F_TSO_ECN | TUN_F_UFO)) | |
505 | return -EINVAL; | |
506 | ||
507 | /* TODO: add support for these, so far we don't | |
508 | support any offload */ | |
509 | if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | | |
510 | TUN_F_TSO_ECN | TUN_F_UFO)) | |
511 | return -EINVAL; | |
512 | ||
513 | return 0; | |
514 | ||
515 | default: | |
516 | return -EINVAL; | |
517 | } | |
518 | } | |
519 | ||
520 | #ifdef CONFIG_COMPAT | |
/*
 * compat ioctl entry point: convert the argument with compat_ptr() and
 * reuse the native handler.
 */
static long macvtap_compat_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return macvtap_ioctl(file, cmd, native_arg);
}
526 | #endif | |
527 | ||
528 | static const struct file_operations macvtap_fops = { | |
529 | .owner = THIS_MODULE, | |
530 | .open = macvtap_open, | |
531 | .release = macvtap_release, | |
532 | .aio_read = macvtap_aio_read, | |
533 | .aio_write = macvtap_aio_write, | |
534 | .poll = macvtap_poll, | |
535 | .llseek = no_llseek, | |
536 | .unlocked_ioctl = macvtap_ioctl, | |
537 | #ifdef CONFIG_COMPAT | |
538 | .compat_ioctl = macvtap_compat_ioctl, | |
539 | #endif | |
540 | }; | |
541 | ||
542 | static int macvtap_init(void) | |
543 | { | |
544 | int err; | |
545 | ||
546 | err = alloc_chrdev_region(&macvtap_major, 0, | |
547 | MACVTAP_NUM_DEVS, "macvtap"); | |
548 | if (err) | |
549 | goto out1; | |
550 | ||
551 | cdev_init(&macvtap_cdev, &macvtap_fops); | |
552 | err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); | |
553 | if (err) | |
554 | goto out2; | |
555 | ||
556 | macvtap_class = class_create(THIS_MODULE, "macvtap"); | |
557 | if (IS_ERR(macvtap_class)) { | |
558 | err = PTR_ERR(macvtap_class); | |
559 | goto out3; | |
560 | } | |
561 | ||
562 | err = macvlan_link_register(&macvtap_link_ops); | |
563 | if (err) | |
564 | goto out4; | |
565 | ||
566 | return 0; | |
567 | ||
568 | out4: | |
569 | class_unregister(macvtap_class); | |
570 | out3: | |
571 | cdev_del(&macvtap_cdev); | |
572 | out2: | |
573 | unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); | |
574 | out1: | |
575 | return err; | |
576 | } | |
577 | module_init(macvtap_init); | |
578 | ||
579 | static void macvtap_exit(void) | |
580 | { | |
581 | rtnl_link_unregister(&macvtap_link_ops); | |
582 | class_unregister(macvtap_class); | |
583 | cdev_del(&macvtap_cdev); | |
584 | unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); | |
585 | } | |
586 | module_exit(macvtap_exit); | |
587 | ||
588 | MODULE_ALIAS_RTNL_LINK("macvtap"); | |
589 | MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); | |
590 | MODULE_LICENSE("GPL"); |