tools, bpf_asm: simplify parser rule for BPF extensions
[linux-2.6-block.git] / net / ipv4 / ip_tunnel.c
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/init.h>
34 #include <linux/in6.h>
35 #include <linux/inetdevice.h>
36 #include <linux/igmp.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/etherdevice.h>
39 #include <linux/if_ether.h>
40 #include <linux/if_vlan.h>
41 #include <linux/rculist.h>
42 #include <linux/err.h>
43
44 #include <net/sock.h>
45 #include <net/ip.h>
46 #include <net/icmp.h>
47 #include <net/protocol.h>
48 #include <net/ip_tunnels.h>
49 #include <net/arp.h>
50 #include <net/checksum.h>
51 #include <net/dsfield.h>
52 #include <net/inet_ecn.h>
53 #include <net/xfrm.h>
54 #include <net/net_namespace.h>
55 #include <net/netns/generic.h>
56 #include <net/rtnetlink.h>
57 #include <net/udp.h>
58
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67         return hash_32((__force u32)key ^ (__force u32)remote,
68                          IP_TNL_HASH_BITS);
69 }
70
71 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
72                                 __be16 flags, __be32 key)
73 {
74         if (p->i_flags & TUNNEL_KEY) {
75                 if (flags & TUNNEL_KEY)
76                         return key == p->i_key;
77                 else
78                         /* key expected, none present */
79                         return false;
80         } else
81                 return !(flags & TUNNEL_KEY);
82 }
83
84 /* Fallback tunnel: no source, no destination, no key, no options
85
86    Tunnel hash table:
87    We require exact key match i.e. if a key is present in packet
88    it will match only tunnel with the same key; if it is not present,
89    it will match only keyless tunnel.
90
91    All keysless packets, if not matched configured keyless tunnels
92    will match fallback tunnel.
93    Given src, dst and key, find appropriate for input tunnel.
94 */
95 struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
96                                    int link, __be16 flags,
97                                    __be32 remote, __be32 local,
98                                    __be32 key)
99 {
100         unsigned int hash;
101         struct ip_tunnel *t, *cand = NULL;
102         struct hlist_head *head;
103
104         hash = ip_tunnel_hash(key, remote);
105         head = &itn->tunnels[hash];
106
107         hlist_for_each_entry_rcu(t, head, hash_node) {
108                 if (local != t->parms.iph.saddr ||
109                     remote != t->parms.iph.daddr ||
110                     !(t->dev->flags & IFF_UP))
111                         continue;
112
113                 if (!ip_tunnel_key_match(&t->parms, flags, key))
114                         continue;
115
116                 if (t->parms.link == link)
117                         return t;
118                 else
119                         cand = t;
120         }
121
122         hlist_for_each_entry_rcu(t, head, hash_node) {
123                 if (remote != t->parms.iph.daddr ||
124                     t->parms.iph.saddr != 0 ||
125                     !(t->dev->flags & IFF_UP))
126                         continue;
127
128                 if (!ip_tunnel_key_match(&t->parms, flags, key))
129                         continue;
130
131                 if (t->parms.link == link)
132                         return t;
133                 else if (!cand)
134                         cand = t;
135         }
136
137         hash = ip_tunnel_hash(key, 0);
138         head = &itn->tunnels[hash];
139
140         hlist_for_each_entry_rcu(t, head, hash_node) {
141                 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
142                     (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
143                         continue;
144
145                 if (!(t->dev->flags & IFF_UP))
146                         continue;
147
148                 if (!ip_tunnel_key_match(&t->parms, flags, key))
149                         continue;
150
151                 if (t->parms.link == link)
152                         return t;
153                 else if (!cand)
154                         cand = t;
155         }
156
157         if (flags & TUNNEL_NO_KEY)
158                 goto skip_key_lookup;
159
160         hlist_for_each_entry_rcu(t, head, hash_node) {
161                 if (t->parms.i_key != key ||
162                     t->parms.iph.saddr != 0 ||
163                     t->parms.iph.daddr != 0 ||
164                     !(t->dev->flags & IFF_UP))
165                         continue;
166
167                 if (t->parms.link == link)
168                         return t;
169                 else if (!cand)
170                         cand = t;
171         }
172
173 skip_key_lookup:
174         if (cand)
175                 return cand;
176
177         t = rcu_dereference(itn->collect_md_tun);
178         if (t)
179                 return t;
180
181         if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
182                 return netdev_priv(itn->fb_tunnel_dev);
183
184         return NULL;
185 }
186 EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
187
188 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
189                                     struct ip_tunnel_parm *parms)
190 {
191         unsigned int h;
192         __be32 remote;
193         __be32 i_key = parms->i_key;
194
195         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
196                 remote = parms->iph.daddr;
197         else
198                 remote = 0;
199
200         if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
201                 i_key = 0;
202
203         h = ip_tunnel_hash(i_key, remote);
204         return &itn->tunnels[h];
205 }
206
207 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
208 {
209         struct hlist_head *head = ip_bucket(itn, &t->parms);
210
211         if (t->collect_md)
212                 rcu_assign_pointer(itn->collect_md_tun, t);
213         hlist_add_head_rcu(&t->hash_node, head);
214 }
215
216 static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
217 {
218         if (t->collect_md)
219                 rcu_assign_pointer(itn->collect_md_tun, NULL);
220         hlist_del_init_rcu(&t->hash_node);
221 }
222
223 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
224                                         struct ip_tunnel_parm *parms,
225                                         int type)
226 {
227         __be32 remote = parms->iph.daddr;
228         __be32 local = parms->iph.saddr;
229         __be32 key = parms->i_key;
230         __be16 flags = parms->i_flags;
231         int link = parms->link;
232         struct ip_tunnel *t = NULL;
233         struct hlist_head *head = ip_bucket(itn, parms);
234
235         hlist_for_each_entry_rcu(t, head, hash_node) {
236                 if (local == t->parms.iph.saddr &&
237                     remote == t->parms.iph.daddr &&
238                     link == t->parms.link &&
239                     type == t->dev->type &&
240                     ip_tunnel_key_match(&t->parms, flags, key))
241                         break;
242         }
243         return t;
244 }
245
246 static struct net_device *__ip_tunnel_create(struct net *net,
247                                              const struct rtnl_link_ops *ops,
248                                              struct ip_tunnel_parm *parms)
249 {
250         int err;
251         struct ip_tunnel *tunnel;
252         struct net_device *dev;
253         char name[IFNAMSIZ];
254
255         if (parms->name[0])
256                 strlcpy(name, parms->name, IFNAMSIZ);
257         else {
258                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
259                         err = -E2BIG;
260                         goto failed;
261                 }
262                 strlcpy(name, ops->kind, IFNAMSIZ);
263                 strncat(name, "%d", 2);
264         }
265
266         ASSERT_RTNL();
267         dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
268         if (!dev) {
269                 err = -ENOMEM;
270                 goto failed;
271         }
272         dev_net_set(dev, net);
273
274         dev->rtnl_link_ops = ops;
275
276         tunnel = netdev_priv(dev);
277         tunnel->parms = *parms;
278         tunnel->net = net;
279
280         err = register_netdevice(dev);
281         if (err)
282                 goto failed_free;
283
284         return dev;
285
286 failed_free:
287         free_netdev(dev);
288 failed:
289         return ERR_PTR(err);
290 }
291
292 static inline void init_tunnel_flow(struct flowi4 *fl4,
293                                     int proto,
294                                     __be32 daddr, __be32 saddr,
295                                     __be32 key, __u8 tos, int oif)
296 {
297         memset(fl4, 0, sizeof(*fl4));
298         fl4->flowi4_oif = oif;
299         fl4->daddr = daddr;
300         fl4->saddr = saddr;
301         fl4->flowi4_tos = tos;
302         fl4->flowi4_proto = proto;
303         fl4->fl4_gre_key = key;
304 }
305
306 static int ip_tunnel_bind_dev(struct net_device *dev)
307 {
308         struct net_device *tdev = NULL;
309         struct ip_tunnel *tunnel = netdev_priv(dev);
310         const struct iphdr *iph;
311         int hlen = LL_MAX_HEADER;
312         int mtu = ETH_DATA_LEN;
313         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
314
315         iph = &tunnel->parms.iph;
316
317         /* Guess output device to choose reasonable mtu and needed_headroom */
318         if (iph->daddr) {
319                 struct flowi4 fl4;
320                 struct rtable *rt;
321
322                 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
323                                  iph->saddr, tunnel->parms.o_key,
324                                  RT_TOS(iph->tos), tunnel->parms.link);
325                 rt = ip_route_output_key(tunnel->net, &fl4);
326
327                 if (!IS_ERR(rt)) {
328                         tdev = rt->dst.dev;
329                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
330                                           fl4.saddr);
331                         ip_rt_put(rt);
332                 }
333                 if (dev->type != ARPHRD_ETHER)
334                         dev->flags |= IFF_POINTOPOINT;
335         }
336
337         if (!tdev && tunnel->parms.link)
338                 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
339
340         if (tdev) {
341                 hlen = tdev->hard_header_len + tdev->needed_headroom;
342                 mtu = tdev->mtu;
343         }
344
345         dev->needed_headroom = t_hlen + hlen;
346         mtu -= (dev->hard_header_len + t_hlen);
347
348         if (mtu < 68)
349                 mtu = 68;
350
351         return mtu;
352 }
353
354 static struct ip_tunnel *ip_tunnel_create(struct net *net,
355                                           struct ip_tunnel_net *itn,
356                                           struct ip_tunnel_parm *parms)
357 {
358         struct ip_tunnel *nt;
359         struct net_device *dev;
360
361         BUG_ON(!itn->fb_tunnel_dev);
362         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
363         if (IS_ERR(dev))
364                 return ERR_CAST(dev);
365
366         dev->mtu = ip_tunnel_bind_dev(dev);
367
368         nt = netdev_priv(dev);
369         ip_tunnel_add(itn, nt);
370         return nt;
371 }
372
373 int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
374                   const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
375                   bool log_ecn_error)
376 {
377         struct pcpu_sw_netstats *tstats;
378         const struct iphdr *iph = ip_hdr(skb);
379         int err;
380
381 #ifdef CONFIG_NET_IPGRE_BROADCAST
382         if (ipv4_is_multicast(iph->daddr)) {
383                 tunnel->dev->stats.multicast++;
384                 skb->pkt_type = PACKET_BROADCAST;
385         }
386 #endif
387
388         if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
389              ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
390                 tunnel->dev->stats.rx_crc_errors++;
391                 tunnel->dev->stats.rx_errors++;
392                 goto drop;
393         }
394
395         if (tunnel->parms.i_flags&TUNNEL_SEQ) {
396                 if (!(tpi->flags&TUNNEL_SEQ) ||
397                     (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
398                         tunnel->dev->stats.rx_fifo_errors++;
399                         tunnel->dev->stats.rx_errors++;
400                         goto drop;
401                 }
402                 tunnel->i_seqno = ntohl(tpi->seq) + 1;
403         }
404
405         skb_reset_network_header(skb);
406
407         err = IP_ECN_decapsulate(iph, skb);
408         if (unlikely(err)) {
409                 if (log_ecn_error)
410                         net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
411                                         &iph->saddr, iph->tos);
412                 if (err > 1) {
413                         ++tunnel->dev->stats.rx_frame_errors;
414                         ++tunnel->dev->stats.rx_errors;
415                         goto drop;
416                 }
417         }
418
419         tstats = this_cpu_ptr(tunnel->dev->tstats);
420         u64_stats_update_begin(&tstats->syncp);
421         tstats->rx_packets++;
422         tstats->rx_bytes += skb->len;
423         u64_stats_update_end(&tstats->syncp);
424
425         skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
426
427         if (tunnel->dev->type == ARPHRD_ETHER) {
428                 skb->protocol = eth_type_trans(skb, tunnel->dev);
429                 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
430         } else {
431                 skb->dev = tunnel->dev;
432         }
433
434         if (tun_dst)
435                 skb_dst_set(skb, (struct dst_entry *)tun_dst);
436
437         gro_cells_receive(&tunnel->gro_cells, skb);
438         return 0;
439
440 drop:
441         kfree_skb(skb);
442         return 0;
443 }
444 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
445
446 static int ip_encap_hlen(struct ip_tunnel_encap *e)
447 {
448         const struct ip_tunnel_encap_ops *ops;
449         int hlen = -EINVAL;
450
451         if (e->type == TUNNEL_ENCAP_NONE)
452                 return 0;
453
454         if (e->type >= MAX_IPTUN_ENCAP_OPS)
455                 return -EINVAL;
456
457         rcu_read_lock();
458         ops = rcu_dereference(iptun_encaps[e->type]);
459         if (likely(ops && ops->encap_hlen))
460                 hlen = ops->encap_hlen(e);
461         rcu_read_unlock();
462
463         return hlen;
464 }
465
466 const struct ip_tunnel_encap_ops __rcu *
467                 iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
468
469 int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
470                             unsigned int num)
471 {
472         if (num >= MAX_IPTUN_ENCAP_OPS)
473                 return -ERANGE;
474
475         return !cmpxchg((const struct ip_tunnel_encap_ops **)
476                         &iptun_encaps[num],
477                         NULL, ops) ? 0 : -1;
478 }
479 EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
480
481 int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
482                             unsigned int num)
483 {
484         int ret;
485
486         if (num >= MAX_IPTUN_ENCAP_OPS)
487                 return -ERANGE;
488
489         ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
490                        &iptun_encaps[num],
491                        ops, NULL) == ops) ? 0 : -1;
492
493         synchronize_net();
494
495         return ret;
496 }
497 EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
498
499 int ip_tunnel_encap_setup(struct ip_tunnel *t,
500                           struct ip_tunnel_encap *ipencap)
501 {
502         int hlen;
503
504         memset(&t->encap, 0, sizeof(t->encap));
505
506         hlen = ip_encap_hlen(ipencap);
507         if (hlen < 0)
508                 return hlen;
509
510         t->encap.type = ipencap->type;
511         t->encap.sport = ipencap->sport;
512         t->encap.dport = ipencap->dport;
513         t->encap.flags = ipencap->flags;
514
515         t->encap_hlen = hlen;
516         t->hlen = t->encap_hlen + t->tun_hlen;
517
518         return 0;
519 }
520 EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
521
522 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
523                     u8 *protocol, struct flowi4 *fl4)
524 {
525         const struct ip_tunnel_encap_ops *ops;
526         int ret = -EINVAL;
527
528         if (t->encap.type == TUNNEL_ENCAP_NONE)
529                 return 0;
530
531         if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
532                 return -EINVAL;
533
534         rcu_read_lock();
535         ops = rcu_dereference(iptun_encaps[t->encap.type]);
536         if (likely(ops && ops->build_header))
537                 ret = ops->build_header(skb, &t->encap, protocol, fl4);
538         rcu_read_unlock();
539
540         return ret;
541 }
542 EXPORT_SYMBOL(ip_tunnel_encap);
543
544 static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
545                             struct rtable *rt, __be16 df,
546                             const struct iphdr *inner_iph)
547 {
548         struct ip_tunnel *tunnel = netdev_priv(dev);
549         int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
550         int mtu;
551
552         if (df)
553                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
554                                         - sizeof(struct iphdr) - tunnel->hlen;
555         else
556                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
557
558         if (skb_dst(skb))
559                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
560
561         if (skb->protocol == htons(ETH_P_IP)) {
562                 if (!skb_is_gso(skb) &&
563                     (inner_iph->frag_off & htons(IP_DF)) &&
564                     mtu < pkt_size) {
565                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
566                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
567                         return -E2BIG;
568                 }
569         }
570 #if IS_ENABLED(CONFIG_IPV6)
571         else if (skb->protocol == htons(ETH_P_IPV6)) {
572                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
573
574                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
575                            mtu >= IPV6_MIN_MTU) {
576                         if ((tunnel->parms.iph.daddr &&
577                             !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
578                             rt6->rt6i_dst.plen == 128) {
579                                 rt6->rt6i_flags |= RTF_MODIFIED;
580                                 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
581                         }
582                 }
583
584                 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
585                                         mtu < pkt_size) {
586                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
587                         return -E2BIG;
588                 }
589         }
590 #endif
591         return 0;
592 }
593
594 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
595                     const struct iphdr *tnl_params, u8 protocol)
596 {
597         struct ip_tunnel *tunnel = netdev_priv(dev);
598         const struct iphdr *inner_iph;
599         struct flowi4 fl4;
600         u8     tos, ttl;
601         __be16 df;
602         struct rtable *rt;              /* Route to the other host */
603         unsigned int max_headroom;      /* The extra header space needed */
604         __be32 dst;
605         bool connected;
606
607         inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
608         connected = (tunnel->parms.iph.daddr != 0);
609
610         dst = tnl_params->daddr;
611         if (dst == 0) {
612                 /* NBMA tunnel */
613
614                 if (!skb_dst(skb)) {
615                         dev->stats.tx_fifo_errors++;
616                         goto tx_error;
617                 }
618
619                 if (skb->protocol == htons(ETH_P_IP)) {
620                         rt = skb_rtable(skb);
621                         dst = rt_nexthop(rt, inner_iph->daddr);
622                 }
623 #if IS_ENABLED(CONFIG_IPV6)
624                 else if (skb->protocol == htons(ETH_P_IPV6)) {
625                         const struct in6_addr *addr6;
626                         struct neighbour *neigh;
627                         bool do_tx_error_icmp;
628                         int addr_type;
629
630                         neigh = dst_neigh_lookup(skb_dst(skb),
631                                                  &ipv6_hdr(skb)->daddr);
632                         if (!neigh)
633                                 goto tx_error;
634
635                         addr6 = (const struct in6_addr *)&neigh->primary_key;
636                         addr_type = ipv6_addr_type(addr6);
637
638                         if (addr_type == IPV6_ADDR_ANY) {
639                                 addr6 = &ipv6_hdr(skb)->daddr;
640                                 addr_type = ipv6_addr_type(addr6);
641                         }
642
643                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
644                                 do_tx_error_icmp = true;
645                         else {
646                                 do_tx_error_icmp = false;
647                                 dst = addr6->s6_addr32[3];
648                         }
649                         neigh_release(neigh);
650                         if (do_tx_error_icmp)
651                                 goto tx_error_icmp;
652                 }
653 #endif
654                 else
655                         goto tx_error;
656
657                 connected = false;
658         }
659
660         tos = tnl_params->tos;
661         if (tos & 0x1) {
662                 tos &= ~0x1;
663                 if (skb->protocol == htons(ETH_P_IP)) {
664                         tos = inner_iph->tos;
665                         connected = false;
666                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
667                         tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
668                         connected = false;
669                 }
670         }
671
672         init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
673                          tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
674
675         if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
676                 goto tx_error;
677
678         rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
679                          NULL;
680
681         if (!rt) {
682                 rt = ip_route_output_key(tunnel->net, &fl4);
683
684                 if (IS_ERR(rt)) {
685                         dev->stats.tx_carrier_errors++;
686                         goto tx_error;
687                 }
688                 if (connected)
689                         dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
690                                           fl4.saddr);
691         }
692
693         if (rt->dst.dev == dev) {
694                 ip_rt_put(rt);
695                 dev->stats.collisions++;
696                 goto tx_error;
697         }
698
699         if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
700                 ip_rt_put(rt);
701                 goto tx_error;
702         }
703
704         if (tunnel->err_count > 0) {
705                 if (time_before(jiffies,
706                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
707                         tunnel->err_count--;
708
709                         memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
710                         dst_link_failure(skb);
711                 } else
712                         tunnel->err_count = 0;
713         }
714
715         tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
716         ttl = tnl_params->ttl;
717         if (ttl == 0) {
718                 if (skb->protocol == htons(ETH_P_IP))
719                         ttl = inner_iph->ttl;
720 #if IS_ENABLED(CONFIG_IPV6)
721                 else if (skb->protocol == htons(ETH_P_IPV6))
722                         ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
723 #endif
724                 else
725                         ttl = ip4_dst_hoplimit(&rt->dst);
726         }
727
728         df = tnl_params->frag_off;
729         if (skb->protocol == htons(ETH_P_IP))
730                 df |= (inner_iph->frag_off&htons(IP_DF));
731
732         max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
733                         + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
734         if (max_headroom > dev->needed_headroom)
735                 dev->needed_headroom = max_headroom;
736
737         if (skb_cow_head(skb, dev->needed_headroom)) {
738                 ip_rt_put(rt);
739                 dev->stats.tx_dropped++;
740                 kfree_skb(skb);
741                 return;
742         }
743
744         iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
745                       df, !net_eq(tunnel->net, dev_net(dev)));
746         return;
747
748 #if IS_ENABLED(CONFIG_IPV6)
749 tx_error_icmp:
750         dst_link_failure(skb);
751 #endif
752 tx_error:
753         dev->stats.tx_errors++;
754         kfree_skb(skb);
755 }
756 EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
757
758 static void ip_tunnel_update(struct ip_tunnel_net *itn,
759                              struct ip_tunnel *t,
760                              struct net_device *dev,
761                              struct ip_tunnel_parm *p,
762                              bool set_mtu)
763 {
764         ip_tunnel_del(itn, t);
765         t->parms.iph.saddr = p->iph.saddr;
766         t->parms.iph.daddr = p->iph.daddr;
767         t->parms.i_key = p->i_key;
768         t->parms.o_key = p->o_key;
769         if (dev->type != ARPHRD_ETHER) {
770                 memcpy(dev->dev_addr, &p->iph.saddr, 4);
771                 memcpy(dev->broadcast, &p->iph.daddr, 4);
772         }
773         ip_tunnel_add(itn, t);
774
775         t->parms.iph.ttl = p->iph.ttl;
776         t->parms.iph.tos = p->iph.tos;
777         t->parms.iph.frag_off = p->iph.frag_off;
778
779         if (t->parms.link != p->link) {
780                 int mtu;
781
782                 t->parms.link = p->link;
783                 mtu = ip_tunnel_bind_dev(dev);
784                 if (set_mtu)
785                         dev->mtu = mtu;
786         }
787         dst_cache_reset(&t->dst_cache);
788         netdev_state_change(dev);
789 }
790
791 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
792 {
793         int err = 0;
794         struct ip_tunnel *t = netdev_priv(dev);
795         struct net *net = t->net;
796         struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
797
798         BUG_ON(!itn->fb_tunnel_dev);
799         switch (cmd) {
800         case SIOCGETTUNNEL:
801                 if (dev == itn->fb_tunnel_dev) {
802                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
803                         if (!t)
804                                 t = netdev_priv(dev);
805                 }
806                 memcpy(p, &t->parms, sizeof(*p));
807                 break;
808
809         case SIOCADDTUNNEL:
810         case SIOCCHGTUNNEL:
811                 err = -EPERM;
812                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
813                         goto done;
814                 if (p->iph.ttl)
815                         p->iph.frag_off |= htons(IP_DF);
816                 if (!(p->i_flags & VTI_ISVTI)) {
817                         if (!(p->i_flags & TUNNEL_KEY))
818                                 p->i_key = 0;
819                         if (!(p->o_flags & TUNNEL_KEY))
820                                 p->o_key = 0;
821                 }
822
823                 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
824
825                 if (cmd == SIOCADDTUNNEL) {
826                         if (!t) {
827                                 t = ip_tunnel_create(net, itn, p);
828                                 err = PTR_ERR_OR_ZERO(t);
829                                 break;
830                         }
831
832                         err = -EEXIST;
833                         break;
834                 }
835                 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
836                         if (t) {
837                                 if (t->dev != dev) {
838                                         err = -EEXIST;
839                                         break;
840                                 }
841                         } else {
842                                 unsigned int nflags = 0;
843
844                                 if (ipv4_is_multicast(p->iph.daddr))
845                                         nflags = IFF_BROADCAST;
846                                 else if (p->iph.daddr)
847                                         nflags = IFF_POINTOPOINT;
848
849                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
850                                         err = -EINVAL;
851                                         break;
852                                 }
853
854                                 t = netdev_priv(dev);
855                         }
856                 }
857
858                 if (t) {
859                         err = 0;
860                         ip_tunnel_update(itn, t, dev, p, true);
861                 } else {
862                         err = -ENOENT;
863                 }
864                 break;
865
866         case SIOCDELTUNNEL:
867                 err = -EPERM;
868                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
869                         goto done;
870
871                 if (dev == itn->fb_tunnel_dev) {
872                         err = -ENOENT;
873                         t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
874                         if (!t)
875                                 goto done;
876                         err = -EPERM;
877                         if (t == netdev_priv(itn->fb_tunnel_dev))
878                                 goto done;
879                         dev = t->dev;
880                 }
881                 unregister_netdevice(dev);
882                 err = 0;
883                 break;
884
885         default:
886                 err = -EINVAL;
887         }
888
889 done:
890         return err;
891 }
892 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
893
894 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
895 {
896         struct ip_tunnel *tunnel = netdev_priv(dev);
897         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
898
899         if (new_mtu < 68 ||
900             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
901                 return -EINVAL;
902         dev->mtu = new_mtu;
903         return 0;
904 }
905 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
906
907 static void ip_tunnel_dev_free(struct net_device *dev)
908 {
909         struct ip_tunnel *tunnel = netdev_priv(dev);
910
911         gro_cells_destroy(&tunnel->gro_cells);
912         dst_cache_destroy(&tunnel->dst_cache);
913         free_percpu(dev->tstats);
914         free_netdev(dev);
915 }
916
917 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
918 {
919         struct ip_tunnel *tunnel = netdev_priv(dev);
920         struct ip_tunnel_net *itn;
921
922         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
923
924         if (itn->fb_tunnel_dev != dev) {
925                 ip_tunnel_del(itn, netdev_priv(dev));
926                 unregister_netdevice_queue(dev, head);
927         }
928 }
929 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
930
931 struct net *ip_tunnel_get_link_net(const struct net_device *dev)
932 {
933         struct ip_tunnel *tunnel = netdev_priv(dev);
934
935         return tunnel->net;
936 }
937 EXPORT_SYMBOL(ip_tunnel_get_link_net);
938
939 int ip_tunnel_get_iflink(const struct net_device *dev)
940 {
941         struct ip_tunnel *tunnel = netdev_priv(dev);
942
943         return tunnel->parms.link;
944 }
945 EXPORT_SYMBOL(ip_tunnel_get_iflink);
946
947 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
948                                   struct rtnl_link_ops *ops, char *devname)
949 {
950         struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
951         struct ip_tunnel_parm parms;
952         unsigned int i;
953
954         for (i = 0; i < IP_TNL_HASH_SIZE; i++)
955                 INIT_HLIST_HEAD(&itn->tunnels[i]);
956
957         if (!ops) {
958                 itn->fb_tunnel_dev = NULL;
959                 return 0;
960         }
961
962         memset(&parms, 0, sizeof(parms));
963         if (devname)
964                 strlcpy(parms.name, devname, IFNAMSIZ);
965
966         rtnl_lock();
967         itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
968         /* FB netdevice is special: we have one, and only one per netns.
969          * Allowing to move it to another netns is clearly unsafe.
970          */
971         if (!IS_ERR(itn->fb_tunnel_dev)) {
972                 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
973                 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
974                 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
975         }
976         rtnl_unlock();
977
978         return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
979 }
980 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
981
982 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
983                               struct rtnl_link_ops *ops)
984 {
985         struct net *net = dev_net(itn->fb_tunnel_dev);
986         struct net_device *dev, *aux;
987         int h;
988
989         for_each_netdev_safe(net, dev, aux)
990                 if (dev->rtnl_link_ops == ops)
991                         unregister_netdevice_queue(dev, head);
992
993         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
994                 struct ip_tunnel *t;
995                 struct hlist_node *n;
996                 struct hlist_head *thead = &itn->tunnels[h];
997
998                 hlist_for_each_entry_safe(t, n, thead, hash_node)
999                         /* If dev is in the same netns, it has already
1000                          * been added to the list by the previous loop.
1001                          */
1002                         if (!net_eq(dev_net(t->dev), net))
1003                                 unregister_netdevice_queue(t->dev, head);
1004         }
1005 }
1006
1007 void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
1008 {
1009         LIST_HEAD(list);
1010
1011         rtnl_lock();
1012         ip_tunnel_destroy(itn, &list, ops);
1013         unregister_netdevice_many(&list);
1014         rtnl_unlock();
1015 }
1016 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1017
1018 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1019                       struct ip_tunnel_parm *p)
1020 {
1021         struct ip_tunnel *nt;
1022         struct net *net = dev_net(dev);
1023         struct ip_tunnel_net *itn;
1024         int mtu;
1025         int err;
1026
1027         nt = netdev_priv(dev);
1028         itn = net_generic(net, nt->ip_tnl_net_id);
1029
1030         if (nt->collect_md) {
1031                 if (rtnl_dereference(itn->collect_md_tun))
1032                         return -EEXIST;
1033         } else {
1034                 if (ip_tunnel_find(itn, p, dev->type))
1035                         return -EEXIST;
1036         }
1037
1038         nt->net = net;
1039         nt->parms = *p;
1040         err = register_netdevice(dev);
1041         if (err)
1042                 goto out;
1043
1044         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1045                 eth_hw_addr_random(dev);
1046
1047         mtu = ip_tunnel_bind_dev(dev);
1048         if (!tb[IFLA_MTU])
1049                 dev->mtu = mtu;
1050
1051         ip_tunnel_add(itn, nt);
1052 out:
1053         return err;
1054 }
1055 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1056
1057 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1058                          struct ip_tunnel_parm *p)
1059 {
1060         struct ip_tunnel *t;
1061         struct ip_tunnel *tunnel = netdev_priv(dev);
1062         struct net *net = tunnel->net;
1063         struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1064
1065         if (dev == itn->fb_tunnel_dev)
1066                 return -EINVAL;
1067
1068         t = ip_tunnel_find(itn, p, dev->type);
1069
1070         if (t) {
1071                 if (t->dev != dev)
1072                         return -EEXIST;
1073         } else {
1074                 t = tunnel;
1075
1076                 if (dev->type != ARPHRD_ETHER) {
1077                         unsigned int nflags = 0;
1078
1079                         if (ipv4_is_multicast(p->iph.daddr))
1080                                 nflags = IFF_BROADCAST;
1081                         else if (p->iph.daddr)
1082                                 nflags = IFF_POINTOPOINT;
1083
1084                         if ((dev->flags ^ nflags) &
1085                             (IFF_POINTOPOINT | IFF_BROADCAST))
1086                                 return -EINVAL;
1087                 }
1088         }
1089
1090         ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1091         return 0;
1092 }
1093 EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1094
1095 int ip_tunnel_init(struct net_device *dev)
1096 {
1097         struct ip_tunnel *tunnel = netdev_priv(dev);
1098         struct iphdr *iph = &tunnel->parms.iph;
1099         int err;
1100
1101         dev->destructor = ip_tunnel_dev_free;
1102         dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
1103         if (!dev->tstats)
1104                 return -ENOMEM;
1105
1106         err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
1107         if (err) {
1108                 free_percpu(dev->tstats);
1109                 return err;
1110         }
1111
1112         err = gro_cells_init(&tunnel->gro_cells, dev);
1113         if (err) {
1114                 dst_cache_destroy(&tunnel->dst_cache);
1115                 free_percpu(dev->tstats);
1116                 return err;
1117         }
1118
1119         tunnel->dev = dev;
1120         tunnel->net = dev_net(dev);
1121         strcpy(tunnel->parms.name, dev->name);
1122         iph->version            = 4;
1123         iph->ihl                = 5;
1124
1125         if (tunnel->collect_md) {
1126                 dev->features |= NETIF_F_NETNS_LOCAL;
1127                 netif_keep_dst(dev);
1128         }
1129         return 0;
1130 }
1131 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1132
1133 void ip_tunnel_uninit(struct net_device *dev)
1134 {
1135         struct ip_tunnel *tunnel = netdev_priv(dev);
1136         struct net *net = tunnel->net;
1137         struct ip_tunnel_net *itn;
1138
1139         itn = net_generic(net, tunnel->ip_tnl_net_id);
1140         /* fb_tunnel_dev will be unregisted in net-exit call. */
1141         if (itn->fb_tunnel_dev != dev)
1142                 ip_tunnel_del(itn, netdev_priv(dev));
1143
1144         dst_cache_reset(&tunnel->dst_cache);
1145 }
1146 EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1147
1148 /* Do least required initialization, rest of init is done in tunnel_init call */
1149 void ip_tunnel_setup(struct net_device *dev, int net_id)
1150 {
1151         struct ip_tunnel *tunnel = netdev_priv(dev);
1152         tunnel->ip_tnl_net_id = net_id;
1153 }
1154 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1155
1156 MODULE_LICENSE("GPL");