tunnels: advertise link netns via netlink
[linux-2.6-block.git] / net / ipv4 / ip_tunnel.c
CommitLineData
c5441932
PS
1/*
2 * Copyright (c) 2013 Nicira, Inc.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of version 2 of the GNU General Public
6 * License as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301, USA
17 */
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/capability.h>
22#include <linux/module.h>
23#include <linux/types.h>
24#include <linux/kernel.h>
25#include <linux/slab.h>
26#include <linux/uaccess.h>
27#include <linux/skbuff.h>
28#include <linux/netdevice.h>
29#include <linux/in.h>
30#include <linux/tcp.h>
31#include <linux/udp.h>
32#include <linux/if_arp.h>
33#include <linux/mroute.h>
34#include <linux/init.h>
35#include <linux/in6.h>
36#include <linux/inetdevice.h>
37#include <linux/igmp.h>
38#include <linux/netfilter_ipv4.h>
39#include <linux/etherdevice.h>
40#include <linux/if_ether.h>
41#include <linux/if_vlan.h>
42#include <linux/rculist.h>
27d79f3b 43#include <linux/err.h>
c5441932
PS
44
45#include <net/sock.h>
46#include <net/ip.h>
47#include <net/icmp.h>
48#include <net/protocol.h>
49#include <net/ip_tunnels.h>
50#include <net/arp.h>
51#include <net/checksum.h>
52#include <net/dsfield.h>
53#include <net/inet_ecn.h>
54#include <net/xfrm.h>
55#include <net/net_namespace.h>
56#include <net/netns/generic.h>
57#include <net/rtnetlink.h>
56328486 58#include <net/udp.h>
63487bab 59
c5441932
PS
60#if IS_ENABLED(CONFIG_IPV6)
61#include <net/ipv6.h>
62#include <net/ip6_fib.h>
63#include <net/ip6_route.h>
64#endif
65
967680e0 66static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
c5441932
PS
67{
68 return hash_32((__force u32)key ^ (__force u32)remote,
69 IP_TNL_HASH_BITS);
70}
71
6c7e7610 72static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
95cb5745 73 struct dst_entry *dst, __be32 saddr)
7d442fab
TH
74{
75 struct dst_entry *old_dst;
76
f8864972 77 dst_clone(dst);
6c7e7610 78 old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
7d442fab 79 dst_release(old_dst);
95cb5745 80 idst->saddr = saddr;
7d442fab
TH
81}
82
a35165ca 83static noinline void tunnel_dst_set(struct ip_tunnel *t,
95cb5745 84 struct dst_entry *dst, __be32 saddr)
7d442fab 85{
a35165ca 86 __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
7d442fab
TH
87}
88
6c7e7610 89static void tunnel_dst_reset(struct ip_tunnel *t)
7d442fab 90{
95cb5745 91 tunnel_dst_set(t, NULL, 0);
7d442fab
TH
92}
93
cf71d2bc 94void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
9a4aa9af
TH
95{
96 int i;
97
98 for_each_possible_cpu(i)
95cb5745 99 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
9a4aa9af 100}
cf71d2bc 101EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
9a4aa9af 102
95cb5745
DP
103static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
104 u32 cookie, __be32 *saddr)
7d442fab 105{
95cb5745 106 struct ip_tunnel_dst *idst;
7d442fab
TH
107 struct dst_entry *dst;
108
109 rcu_read_lock();
a35165ca 110 idst = raw_cpu_ptr(t->dst_cache);
95cb5745 111 dst = rcu_dereference(idst->dst);
f8864972
ED
112 if (dst && !atomic_inc_not_zero(&dst->__refcnt))
113 dst = NULL;
b045d37b 114 if (dst) {
95cb5745
DP
115 if (!dst->obsolete || dst->ops->check(dst, cookie)) {
116 *saddr = idst->saddr;
117 } else {
b045d37b 118 tunnel_dst_reset(t);
f8864972
ED
119 dst_release(dst);
120 dst = NULL;
b045d37b 121 }
7d442fab 122 }
b045d37b
ED
123 rcu_read_unlock();
124 return (struct rtable *)dst;
7d442fab
TH
125}
126
c5441932
PS
127static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
128 __be16 flags, __be32 key)
129{
130 if (p->i_flags & TUNNEL_KEY) {
131 if (flags & TUNNEL_KEY)
132 return key == p->i_key;
133 else
134 /* key expected, none present */
135 return false;
136 } else
137 return !(flags & TUNNEL_KEY);
138}
139
140/* Fallback tunnel: no source, no destination, no key, no options
141
142 Tunnel hash table:
143 We require exact key match i.e. if a key is present in packet
144 it will match only tunnel with the same key; if it is not present,
145 it will match only keyless tunnel.
146
147 All keysless packets, if not matched configured keyless tunnels
148 will match fallback tunnel.
149 Given src, dst and key, find appropriate for input tunnel.
150*/
151struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
152 int link, __be16 flags,
153 __be32 remote, __be32 local,
154 __be32 key)
155{
156 unsigned int hash;
157 struct ip_tunnel *t, *cand = NULL;
158 struct hlist_head *head;
159
967680e0 160 hash = ip_tunnel_hash(key, remote);
c5441932
PS
161 head = &itn->tunnels[hash];
162
163 hlist_for_each_entry_rcu(t, head, hash_node) {
164 if (local != t->parms.iph.saddr ||
165 remote != t->parms.iph.daddr ||
166 !(t->dev->flags & IFF_UP))
167 continue;
168
169 if (!ip_tunnel_key_match(&t->parms, flags, key))
170 continue;
171
172 if (t->parms.link == link)
173 return t;
174 else
175 cand = t;
176 }
177
178 hlist_for_each_entry_rcu(t, head, hash_node) {
179 if (remote != t->parms.iph.daddr ||
e0056593 180 t->parms.iph.saddr != 0 ||
c5441932
PS
181 !(t->dev->flags & IFF_UP))
182 continue;
183
184 if (!ip_tunnel_key_match(&t->parms, flags, key))
185 continue;
186
187 if (t->parms.link == link)
188 return t;
189 else if (!cand)
190 cand = t;
191 }
192
967680e0 193 hash = ip_tunnel_hash(key, 0);
c5441932
PS
194 head = &itn->tunnels[hash];
195
196 hlist_for_each_entry_rcu(t, head, hash_node) {
e0056593
DP
197 if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
198 (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
199 continue;
200
201 if (!(t->dev->flags & IFF_UP))
c5441932
PS
202 continue;
203
204 if (!ip_tunnel_key_match(&t->parms, flags, key))
205 continue;
206
207 if (t->parms.link == link)
208 return t;
209 else if (!cand)
210 cand = t;
211 }
212
213 if (flags & TUNNEL_NO_KEY)
214 goto skip_key_lookup;
215
216 hlist_for_each_entry_rcu(t, head, hash_node) {
217 if (t->parms.i_key != key ||
e0056593
DP
218 t->parms.iph.saddr != 0 ||
219 t->parms.iph.daddr != 0 ||
c5441932
PS
220 !(t->dev->flags & IFF_UP))
221 continue;
222
223 if (t->parms.link == link)
224 return t;
225 else if (!cand)
226 cand = t;
227 }
228
229skip_key_lookup:
230 if (cand)
231 return cand;
232
233 if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
234 return netdev_priv(itn->fb_tunnel_dev);
235
236
237 return NULL;
238}
239EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
240
241static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
242 struct ip_tunnel_parm *parms)
243{
244 unsigned int h;
245 __be32 remote;
6d608f06 246 __be32 i_key = parms->i_key;
c5441932
PS
247
248 if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
249 remote = parms->iph.daddr;
250 else
251 remote = 0;
252
6d608f06
SK
253 if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
254 i_key = 0;
255
256 h = ip_tunnel_hash(i_key, remote);
c5441932
PS
257 return &itn->tunnels[h];
258}
259
260static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
261{
262 struct hlist_head *head = ip_bucket(itn, &t->parms);
263
264 hlist_add_head_rcu(&t->hash_node, head);
265}
266
267static void ip_tunnel_del(struct ip_tunnel *t)
268{
269 hlist_del_init_rcu(&t->hash_node);
270}
271
272static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
273 struct ip_tunnel_parm *parms,
274 int type)
275{
276 __be32 remote = parms->iph.daddr;
277 __be32 local = parms->iph.saddr;
278 __be32 key = parms->i_key;
5ce54af1 279 __be16 flags = parms->i_flags;
c5441932
PS
280 int link = parms->link;
281 struct ip_tunnel *t = NULL;
282 struct hlist_head *head = ip_bucket(itn, parms);
283
284 hlist_for_each_entry_rcu(t, head, hash_node) {
285 if (local == t->parms.iph.saddr &&
286 remote == t->parms.iph.daddr &&
c5441932 287 link == t->parms.link &&
5ce54af1
DP
288 type == t->dev->type &&
289 ip_tunnel_key_match(&t->parms, flags, key))
c5441932
PS
290 break;
291 }
292 return t;
293}
294
295static struct net_device *__ip_tunnel_create(struct net *net,
296 const struct rtnl_link_ops *ops,
297 struct ip_tunnel_parm *parms)
298{
299 int err;
300 struct ip_tunnel *tunnel;
301 struct net_device *dev;
302 char name[IFNAMSIZ];
303
304 if (parms->name[0])
305 strlcpy(name, parms->name, IFNAMSIZ);
306 else {
54a5d382 307 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
c5441932
PS
308 err = -E2BIG;
309 goto failed;
310 }
311 strlcpy(name, ops->kind, IFNAMSIZ);
312 strncat(name, "%d", 2);
313 }
314
315 ASSERT_RTNL();
c835a677 316 dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
c5441932
PS
317 if (!dev) {
318 err = -ENOMEM;
319 goto failed;
320 }
321 dev_net_set(dev, net);
322
323 dev->rtnl_link_ops = ops;
324
325 tunnel = netdev_priv(dev);
326 tunnel->parms = *parms;
5e6700b3 327 tunnel->net = net;
c5441932
PS
328
329 err = register_netdevice(dev);
330 if (err)
331 goto failed_free;
332
333 return dev;
334
335failed_free:
336 free_netdev(dev);
337failed:
338 return ERR_PTR(err);
339}
340
7d442fab
TH
341static inline void init_tunnel_flow(struct flowi4 *fl4,
342 int proto,
343 __be32 daddr, __be32 saddr,
344 __be32 key, __u8 tos, int oif)
c5441932
PS
345{
346 memset(fl4, 0, sizeof(*fl4));
347 fl4->flowi4_oif = oif;
348 fl4->daddr = daddr;
349 fl4->saddr = saddr;
350 fl4->flowi4_tos = tos;
351 fl4->flowi4_proto = proto;
352 fl4->fl4_gre_key = key;
c5441932
PS
353}
354
355static int ip_tunnel_bind_dev(struct net_device *dev)
356{
357 struct net_device *tdev = NULL;
358 struct ip_tunnel *tunnel = netdev_priv(dev);
359 const struct iphdr *iph;
360 int hlen = LL_MAX_HEADER;
361 int mtu = ETH_DATA_LEN;
362 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
363
364 iph = &tunnel->parms.iph;
365
366 /* Guess output device to choose reasonable mtu and needed_headroom */
367 if (iph->daddr) {
368 struct flowi4 fl4;
369 struct rtable *rt;
370
7d442fab
TH
371 init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
372 iph->saddr, tunnel->parms.o_key,
373 RT_TOS(iph->tos), tunnel->parms.link);
374 rt = ip_route_output_key(tunnel->net, &fl4);
375
c5441932
PS
376 if (!IS_ERR(rt)) {
377 tdev = rt->dst.dev;
95cb5745 378 tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
c5441932
PS
379 ip_rt_put(rt);
380 }
381 if (dev->type != ARPHRD_ETHER)
382 dev->flags |= IFF_POINTOPOINT;
383 }
384
385 if (!tdev && tunnel->parms.link)
6c742e71 386 tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
c5441932
PS
387
388 if (tdev) {
389 hlen = tdev->hard_header_len + tdev->needed_headroom;
390 mtu = tdev->mtu;
391 }
392 dev->iflink = tunnel->parms.link;
393
394 dev->needed_headroom = t_hlen + hlen;
395 mtu -= (dev->hard_header_len + t_hlen);
396
397 if (mtu < 68)
398 mtu = 68;
399
400 return mtu;
401}
402
403static struct ip_tunnel *ip_tunnel_create(struct net *net,
404 struct ip_tunnel_net *itn,
405 struct ip_tunnel_parm *parms)
406{
4929fd8c 407 struct ip_tunnel *nt;
c5441932
PS
408 struct net_device *dev;
409
410 BUG_ON(!itn->fb_tunnel_dev);
c5441932
PS
411 dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
412 if (IS_ERR(dev))
6dd3c9ec 413 return ERR_CAST(dev);
c5441932
PS
414
415 dev->mtu = ip_tunnel_bind_dev(dev);
416
417 nt = netdev_priv(dev);
418 ip_tunnel_add(itn, nt);
419 return nt;
420}
421
422int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
423 const struct tnl_ptk_info *tpi, bool log_ecn_error)
424{
8f84985f 425 struct pcpu_sw_netstats *tstats;
c5441932
PS
426 const struct iphdr *iph = ip_hdr(skb);
427 int err;
428
c5441932
PS
429#ifdef CONFIG_NET_IPGRE_BROADCAST
430 if (ipv4_is_multicast(iph->daddr)) {
c5441932
PS
431 tunnel->dev->stats.multicast++;
432 skb->pkt_type = PACKET_BROADCAST;
433 }
434#endif
435
436 if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
437 ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
438 tunnel->dev->stats.rx_crc_errors++;
439 tunnel->dev->stats.rx_errors++;
440 goto drop;
441 }
442
443 if (tunnel->parms.i_flags&TUNNEL_SEQ) {
444 if (!(tpi->flags&TUNNEL_SEQ) ||
445 (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
446 tunnel->dev->stats.rx_fifo_errors++;
447 tunnel->dev->stats.rx_errors++;
448 goto drop;
449 }
450 tunnel->i_seqno = ntohl(tpi->seq) + 1;
451 }
452
e96f2e7c
YC
453 skb_reset_network_header(skb);
454
c5441932
PS
455 err = IP_ECN_decapsulate(iph, skb);
456 if (unlikely(err)) {
457 if (log_ecn_error)
458 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
459 &iph->saddr, iph->tos);
460 if (err > 1) {
461 ++tunnel->dev->stats.rx_frame_errors;
462 ++tunnel->dev->stats.rx_errors;
463 goto drop;
464 }
465 }
466
467 tstats = this_cpu_ptr(tunnel->dev->tstats);
468 u64_stats_update_begin(&tstats->syncp);
469 tstats->rx_packets++;
470 tstats->rx_bytes += skb->len;
471 u64_stats_update_end(&tstats->syncp);
472
81b9eab5
AS
473 skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
474
3d7b46cd
PS
475 if (tunnel->dev->type == ARPHRD_ETHER) {
476 skb->protocol = eth_type_trans(skb, tunnel->dev);
477 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
478 } else {
479 skb->dev = tunnel->dev;
480 }
64261f23 481
c5441932
PS
482 gro_cells_receive(&tunnel->gro_cells, skb);
483 return 0;
484
485drop:
486 kfree_skb(skb);
487 return 0;
488}
489EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
490
56328486
TH
491static int ip_encap_hlen(struct ip_tunnel_encap *e)
492{
a8c5f90f
TH
493 const struct ip_tunnel_encap_ops *ops;
494 int hlen = -EINVAL;
495
496 if (e->type == TUNNEL_ENCAP_NONE)
56328486 497 return 0;
a8c5f90f
TH
498
499 if (e->type >= MAX_IPTUN_ENCAP_OPS)
56328486 500 return -EINVAL;
a8c5f90f
TH
501
502 rcu_read_lock();
503 ops = rcu_dereference(iptun_encaps[e->type]);
504 if (likely(ops && ops->encap_hlen))
505 hlen = ops->encap_hlen(e);
506 rcu_read_unlock();
507
508 return hlen;
509}
510
511const struct ip_tunnel_encap_ops __rcu *
512 iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
513
514int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
515 unsigned int num)
516{
bb1553c8
TG
517 if (num >= MAX_IPTUN_ENCAP_OPS)
518 return -ERANGE;
519
a8c5f90f
TH
520 return !cmpxchg((const struct ip_tunnel_encap_ops **)
521 &iptun_encaps[num],
522 NULL, ops) ? 0 : -1;
56328486 523}
a8c5f90f
TH
524EXPORT_SYMBOL(ip_tunnel_encap_add_ops);
525
526int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
527 unsigned int num)
528{
529 int ret;
530
bb1553c8
TG
531 if (num >= MAX_IPTUN_ENCAP_OPS)
532 return -ERANGE;
533
a8c5f90f
TH
534 ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
535 &iptun_encaps[num],
536 ops, NULL) == ops) ? 0 : -1;
537
538 synchronize_net();
539
540 return ret;
541}
542EXPORT_SYMBOL(ip_tunnel_encap_del_ops);
56328486
TH
543
544int ip_tunnel_encap_setup(struct ip_tunnel *t,
545 struct ip_tunnel_encap *ipencap)
546{
547 int hlen;
548
549 memset(&t->encap, 0, sizeof(t->encap));
550
551 hlen = ip_encap_hlen(ipencap);
552 if (hlen < 0)
553 return hlen;
554
555 t->encap.type = ipencap->type;
556 t->encap.sport = ipencap->sport;
557 t->encap.dport = ipencap->dport;
558 t->encap.flags = ipencap->flags;
559
560 t->encap_hlen = hlen;
561 t->hlen = t->encap_hlen + t->tun_hlen;
562
563 return 0;
564}
565EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
566
56328486
TH
567int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
568 u8 *protocol, struct flowi4 *fl4)
569{
a8c5f90f
TH
570 const struct ip_tunnel_encap_ops *ops;
571 int ret = -EINVAL;
572
573 if (t->encap.type == TUNNEL_ENCAP_NONE)
56328486 574 return 0;
a8c5f90f 575
f1fb521f
TG
576 if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
577 return -EINVAL;
578
a8c5f90f
TH
579 rcu_read_lock();
580 ops = rcu_dereference(iptun_encaps[t->encap.type]);
581 if (likely(ops && ops->build_header))
582 ret = ops->build_header(skb, &t->encap, protocol, fl4);
583 rcu_read_unlock();
584
585 return ret;
56328486
TH
586}
587EXPORT_SYMBOL(ip_tunnel_encap);
588
23a3647b
PS
589static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
590 struct rtable *rt, __be16 df)
591{
592 struct ip_tunnel *tunnel = netdev_priv(dev);
8c91e162 593 int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
23a3647b
PS
594 int mtu;
595
596 if (df)
597 mtu = dst_mtu(&rt->dst) - dev->hard_header_len
598 - sizeof(struct iphdr) - tunnel->hlen;
599 else
600 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
601
602 if (skb_dst(skb))
603 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
604
605 if (skb->protocol == htons(ETH_P_IP)) {
606 if (!skb_is_gso(skb) &&
607 (df & htons(IP_DF)) && mtu < pkt_size) {
608 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
609 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
610 return -E2BIG;
611 }
612 }
613#if IS_ENABLED(CONFIG_IPV6)
614 else if (skb->protocol == htons(ETH_P_IPV6)) {
615 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
616
617 if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
618 mtu >= IPV6_MIN_MTU) {
619 if ((tunnel->parms.iph.daddr &&
620 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
621 rt6->rt6i_dst.plen == 128) {
622 rt6->rt6i_flags |= RTF_MODIFIED;
623 dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
624 }
625 }
626
627 if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
628 mtu < pkt_size) {
629 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
630 return -E2BIG;
631 }
632 }
633#endif
634 return 0;
635}
636
c5441932 637void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
56328486 638 const struct iphdr *tnl_params, u8 protocol)
c5441932
PS
639{
640 struct ip_tunnel *tunnel = netdev_priv(dev);
641 const struct iphdr *inner_iph;
c5441932
PS
642 struct flowi4 fl4;
643 u8 tos, ttl;
644 __be16 df;
b045d37b 645 struct rtable *rt; /* Route to the other host */
c5441932
PS
646 unsigned int max_headroom; /* The extra header space needed */
647 __be32 dst;
0e6fbc5b 648 int err;
22fb22ea 649 bool connected;
c5441932
PS
650
651 inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
22fb22ea 652 connected = (tunnel->parms.iph.daddr != 0);
c5441932
PS
653
654 dst = tnl_params->daddr;
655 if (dst == 0) {
656 /* NBMA tunnel */
657
658 if (skb_dst(skb) == NULL) {
659 dev->stats.tx_fifo_errors++;
660 goto tx_error;
661 }
662
663 if (skb->protocol == htons(ETH_P_IP)) {
664 rt = skb_rtable(skb);
665 dst = rt_nexthop(rt, inner_iph->daddr);
666 }
667#if IS_ENABLED(CONFIG_IPV6)
668 else if (skb->protocol == htons(ETH_P_IPV6)) {
669 const struct in6_addr *addr6;
670 struct neighbour *neigh;
671 bool do_tx_error_icmp;
672 int addr_type;
673
674 neigh = dst_neigh_lookup(skb_dst(skb),
675 &ipv6_hdr(skb)->daddr);
676 if (neigh == NULL)
677 goto tx_error;
678
679 addr6 = (const struct in6_addr *)&neigh->primary_key;
680 addr_type = ipv6_addr_type(addr6);
681
682 if (addr_type == IPV6_ADDR_ANY) {
683 addr6 = &ipv6_hdr(skb)->daddr;
684 addr_type = ipv6_addr_type(addr6);
685 }
686
687 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
688 do_tx_error_icmp = true;
689 else {
690 do_tx_error_icmp = false;
691 dst = addr6->s6_addr32[3];
692 }
693 neigh_release(neigh);
694 if (do_tx_error_icmp)
695 goto tx_error_icmp;
696 }
697#endif
698 else
699 goto tx_error;
7d442fab
TH
700
701 connected = false;
c5441932
PS
702 }
703
704 tos = tnl_params->tos;
705 if (tos & 0x1) {
706 tos &= ~0x1;
7d442fab 707 if (skb->protocol == htons(ETH_P_IP)) {
c5441932 708 tos = inner_iph->tos;
7d442fab
TH
709 connected = false;
710 } else if (skb->protocol == htons(ETH_P_IPV6)) {
c5441932 711 tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
7d442fab
TH
712 connected = false;
713 }
c5441932
PS
714 }
715
7d442fab
TH
716 init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
717 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);
718
56328486
TH
719 if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
720 goto tx_error;
721
95cb5745 722 rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
7d442fab
TH
723
724 if (!rt) {
725 rt = ip_route_output_key(tunnel->net, &fl4);
726
727 if (IS_ERR(rt)) {
728 dev->stats.tx_carrier_errors++;
729 goto tx_error;
730 }
731 if (connected)
95cb5745 732 tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
c5441932 733 }
7d442fab 734
0e6fbc5b 735 if (rt->dst.dev == dev) {
c5441932
PS
736 ip_rt_put(rt);
737 dev->stats.collisions++;
738 goto tx_error;
739 }
c5441932 740
23a3647b
PS
741 if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
742 ip_rt_put(rt);
743 goto tx_error;
c5441932 744 }
c5441932
PS
745
746 if (tunnel->err_count > 0) {
747 if (time_before(jiffies,
748 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
749 tunnel->err_count--;
750
11c21a30 751 memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
c5441932
PS
752 dst_link_failure(skb);
753 } else
754 tunnel->err_count = 0;
755 }
756
d4a71b15 757 tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
c5441932
PS
758 ttl = tnl_params->ttl;
759 if (ttl == 0) {
760 if (skb->protocol == htons(ETH_P_IP))
761 ttl = inner_iph->ttl;
762#if IS_ENABLED(CONFIG_IPV6)
763 else if (skb->protocol == htons(ETH_P_IPV6))
764 ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
765#endif
766 else
767 ttl = ip4_dst_hoplimit(&rt->dst);
768 }
769
23a3647b
PS
770 df = tnl_params->frag_off;
771 if (skb->protocol == htons(ETH_P_IP))
772 df |= (inner_iph->frag_off&htons(IP_DF));
773
0e6fbc5b 774 max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
7371e022 775 + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
3e08f4a7 776 if (max_headroom > dev->needed_headroom)
c5441932 777 dev->needed_headroom = max_headroom;
3e08f4a7
SK
778
779 if (skb_cow_head(skb, dev->needed_headroom)) {
586d5fc8 780 ip_rt_put(rt);
3e08f4a7 781 dev->stats.tx_dropped++;
3acfa1e7 782 kfree_skb(skb);
3e08f4a7 783 return;
c5441932
PS
784 }
785
aad88724 786 err = iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol,
d4a71b15 787 tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
0e6fbc5b 788 iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
c5441932 789
c5441932
PS
790 return;
791
792#if IS_ENABLED(CONFIG_IPV6)
793tx_error_icmp:
794 dst_link_failure(skb);
795#endif
796tx_error:
797 dev->stats.tx_errors++;
3acfa1e7 798 kfree_skb(skb);
c5441932
PS
799}
800EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
801
802static void ip_tunnel_update(struct ip_tunnel_net *itn,
803 struct ip_tunnel *t,
804 struct net_device *dev,
805 struct ip_tunnel_parm *p,
806 bool set_mtu)
807{
808 ip_tunnel_del(t);
809 t->parms.iph.saddr = p->iph.saddr;
810 t->parms.iph.daddr = p->iph.daddr;
811 t->parms.i_key = p->i_key;
812 t->parms.o_key = p->o_key;
813 if (dev->type != ARPHRD_ETHER) {
814 memcpy(dev->dev_addr, &p->iph.saddr, 4);
815 memcpy(dev->broadcast, &p->iph.daddr, 4);
816 }
817 ip_tunnel_add(itn, t);
818
819 t->parms.iph.ttl = p->iph.ttl;
820 t->parms.iph.tos = p->iph.tos;
821 t->parms.iph.frag_off = p->iph.frag_off;
822
823 if (t->parms.link != p->link) {
824 int mtu;
825
826 t->parms.link = p->link;
827 mtu = ip_tunnel_bind_dev(dev);
828 if (set_mtu)
829 dev->mtu = mtu;
830 }
cf71d2bc 831 ip_tunnel_dst_reset_all(t);
c5441932
PS
832 netdev_state_change(dev);
833}
834
835int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
836{
837 int err = 0;
8c923ce2
ND
838 struct ip_tunnel *t = netdev_priv(dev);
839 struct net *net = t->net;
840 struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
c5441932
PS
841
842 BUG_ON(!itn->fb_tunnel_dev);
843 switch (cmd) {
844 case SIOCGETTUNNEL:
8c923ce2 845 if (dev == itn->fb_tunnel_dev) {
c5441932 846 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
8c923ce2
ND
847 if (t == NULL)
848 t = netdev_priv(dev);
849 }
c5441932
PS
850 memcpy(p, &t->parms, sizeof(*p));
851 break;
852
853 case SIOCADDTUNNEL:
854 case SIOCCHGTUNNEL:
855 err = -EPERM;
856 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
857 goto done;
858 if (p->iph.ttl)
859 p->iph.frag_off |= htons(IP_DF);
7c8e6b9c
DP
860 if (!(p->i_flags & VTI_ISVTI)) {
861 if (!(p->i_flags & TUNNEL_KEY))
862 p->i_key = 0;
863 if (!(p->o_flags & TUNNEL_KEY))
864 p->o_key = 0;
865 }
c5441932
PS
866
867 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
868
d61746b2
SK
869 if (cmd == SIOCADDTUNNEL) {
870 if (!t) {
871 t = ip_tunnel_create(net, itn, p);
872 err = PTR_ERR_OR_ZERO(t);
873 break;
874 }
875
876 err = -EEXIST;
ee30ef4d 877 break;
6dd3c9ec 878 }
c5441932
PS
879 if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
880 if (t != NULL) {
881 if (t->dev != dev) {
882 err = -EEXIST;
883 break;
884 }
885 } else {
886 unsigned int nflags = 0;
887
888 if (ipv4_is_multicast(p->iph.daddr))
889 nflags = IFF_BROADCAST;
890 else if (p->iph.daddr)
891 nflags = IFF_POINTOPOINT;
892
893 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
894 err = -EINVAL;
895 break;
896 }
897
898 t = netdev_priv(dev);
899 }
900 }
901
902 if (t) {
903 err = 0;
904 ip_tunnel_update(itn, t, dev, p, true);
6dd3c9ec
FW
905 } else {
906 err = -ENOENT;
907 }
c5441932
PS
908 break;
909
910 case SIOCDELTUNNEL:
911 err = -EPERM;
912 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
913 goto done;
914
915 if (dev == itn->fb_tunnel_dev) {
916 err = -ENOENT;
917 t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
918 if (t == NULL)
919 goto done;
920 err = -EPERM;
921 if (t == netdev_priv(itn->fb_tunnel_dev))
922 goto done;
923 dev = t->dev;
924 }
925 unregister_netdevice(dev);
926 err = 0;
927 break;
928
929 default:
930 err = -EINVAL;
931 }
932
933done:
934 return err;
935}
936EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
937
938int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
939{
940 struct ip_tunnel *tunnel = netdev_priv(dev);
941 int t_hlen = tunnel->hlen + sizeof(struct iphdr);
942
943 if (new_mtu < 68 ||
944 new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
945 return -EINVAL;
946 dev->mtu = new_mtu;
947 return 0;
948}
949EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
950
951static void ip_tunnel_dev_free(struct net_device *dev)
952{
953 struct ip_tunnel *tunnel = netdev_priv(dev);
954
955 gro_cells_destroy(&tunnel->gro_cells);
9a4aa9af 956 free_percpu(tunnel->dst_cache);
c5441932
PS
957 free_percpu(dev->tstats);
958 free_netdev(dev);
959}
960
961void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
962{
c5441932
PS
963 struct ip_tunnel *tunnel = netdev_priv(dev);
964 struct ip_tunnel_net *itn;
965
6c742e71 966 itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
c5441932
PS
967
968 if (itn->fb_tunnel_dev != dev) {
969 ip_tunnel_del(netdev_priv(dev));
970 unregister_netdevice_queue(dev, head);
971 }
972}
973EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
974
1728d4fa
ND
975struct net *ip_tunnel_get_link_net(const struct net_device *dev)
976{
977 struct ip_tunnel *tunnel = netdev_priv(dev);
978
979 return tunnel->net;
980}
981EXPORT_SYMBOL(ip_tunnel_get_link_net);
982
d3b6f614 983int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
c5441932
PS
984 struct rtnl_link_ops *ops, char *devname)
985{
986 struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
987 struct ip_tunnel_parm parms;
6261d983 988 unsigned int i;
c5441932 989
6261d983 990 for (i = 0; i < IP_TNL_HASH_SIZE; i++)
991 INIT_HLIST_HEAD(&itn->tunnels[i]);
c5441932
PS
992
993 if (!ops) {
994 itn->fb_tunnel_dev = NULL;
995 return 0;
996 }
6261d983 997
c5441932
PS
998 memset(&parms, 0, sizeof(parms));
999 if (devname)
1000 strlcpy(parms.name, devname, IFNAMSIZ);
1001
1002 rtnl_lock();
1003 itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
ea857f28
DC
1004 /* FB netdevice is special: we have one, and only one per netns.
1005 * Allowing to move it to another netns is clearly unsafe.
1006 */
67013282 1007 if (!IS_ERR(itn->fb_tunnel_dev)) {
b4de77ad 1008 itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
78ff4be4 1009 itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
67013282
SK
1010 ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
1011 }
b4de77ad 1012 rtnl_unlock();
c5441932 1013
27d79f3b 1014 return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
c5441932
PS
1015}
1016EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
1017
6c742e71
ND
1018static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
1019 struct rtnl_link_ops *ops)
c5441932 1020{
6c742e71
ND
1021 struct net *net = dev_net(itn->fb_tunnel_dev);
1022 struct net_device *dev, *aux;
c5441932
PS
1023 int h;
1024
6c742e71
ND
1025 for_each_netdev_safe(net, dev, aux)
1026 if (dev->rtnl_link_ops == ops)
1027 unregister_netdevice_queue(dev, head);
1028
c5441932
PS
1029 for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
1030 struct ip_tunnel *t;
1031 struct hlist_node *n;
1032 struct hlist_head *thead = &itn->tunnels[h];
1033
1034 hlist_for_each_entry_safe(t, n, thead, hash_node)
6c742e71
ND
1035 /* If dev is in the same netns, it has already
1036 * been added to the list by the previous loop.
1037 */
1038 if (!net_eq(dev_net(t->dev), net))
1039 unregister_netdevice_queue(t->dev, head);
c5441932 1040 }
c5441932
PS
1041}
1042
6c742e71 1043void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
c5441932
PS
1044{
1045 LIST_HEAD(list);
1046
1047 rtnl_lock();
6c742e71 1048 ip_tunnel_destroy(itn, &list, ops);
c5441932
PS
1049 unregister_netdevice_many(&list);
1050 rtnl_unlock();
c5441932
PS
1051}
1052EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
1053
1054int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
1055 struct ip_tunnel_parm *p)
1056{
1057 struct ip_tunnel *nt;
1058 struct net *net = dev_net(dev);
1059 struct ip_tunnel_net *itn;
1060 int mtu;
1061 int err;
1062
1063 nt = netdev_priv(dev);
1064 itn = net_generic(net, nt->ip_tnl_net_id);
1065
1066 if (ip_tunnel_find(itn, p, dev->type))
1067 return -EEXIST;
1068
5e6700b3 1069 nt->net = net;
c5441932
PS
1070 nt->parms = *p;
1071 err = register_netdevice(dev);
1072 if (err)
1073 goto out;
1074
1075 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1076 eth_hw_addr_random(dev);
1077
1078 mtu = ip_tunnel_bind_dev(dev);
1079 if (!tb[IFLA_MTU])
1080 dev->mtu = mtu;
1081
1082 ip_tunnel_add(itn, nt);
1083
1084out:
1085 return err;
1086}
1087EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
1088
1089int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
1090 struct ip_tunnel_parm *p)
1091{
6c742e71 1092 struct ip_tunnel *t;
c5441932 1093 struct ip_tunnel *tunnel = netdev_priv(dev);
6c742e71 1094 struct net *net = tunnel->net;
c5441932
PS
1095 struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
1096
1097 if (dev == itn->fb_tunnel_dev)
1098 return -EINVAL;
1099
c5441932
PS
1100 t = ip_tunnel_find(itn, p, dev->type);
1101
1102 if (t) {
1103 if (t->dev != dev)
1104 return -EEXIST;
1105 } else {
6c742e71 1106 t = tunnel;
c5441932
PS
1107
1108 if (dev->type != ARPHRD_ETHER) {
1109 unsigned int nflags = 0;
1110
1111 if (ipv4_is_multicast(p->iph.daddr))
1112 nflags = IFF_BROADCAST;
1113 else if (p->iph.daddr)
1114 nflags = IFF_POINTOPOINT;
1115
1116 if ((dev->flags ^ nflags) &
1117 (IFF_POINTOPOINT | IFF_BROADCAST))
1118 return -EINVAL;
1119 }
1120 }
1121
1122 ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
1123 return 0;
1124}
1125EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
1126
1127int ip_tunnel_init(struct net_device *dev)
1128{
1129 struct ip_tunnel *tunnel = netdev_priv(dev);
1130 struct iphdr *iph = &tunnel->parms.iph;
1c213bd2 1131 int err;
c5441932
PS
1132
1133 dev->destructor = ip_tunnel_dev_free;
1c213bd2 1134 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
c5441932
PS
1135 if (!dev->tstats)
1136 return -ENOMEM;
1137
9a4aa9af
TH
1138 tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1139 if (!tunnel->dst_cache) {
1140 free_percpu(dev->tstats);
1141 return -ENOMEM;
1142 }
1143
c5441932
PS
1144 err = gro_cells_init(&tunnel->gro_cells, dev);
1145 if (err) {
9a4aa9af 1146 free_percpu(tunnel->dst_cache);
c5441932
PS
1147 free_percpu(dev->tstats);
1148 return err;
1149 }
1150
1151 tunnel->dev = dev;
6c742e71 1152 tunnel->net = dev_net(dev);
c5441932
PS
1153 strcpy(tunnel->parms.name, dev->name);
1154 iph->version = 4;
1155 iph->ihl = 5;
1156
1157 return 0;
1158}
1159EXPORT_SYMBOL_GPL(ip_tunnel_init);
1160
1161void ip_tunnel_uninit(struct net_device *dev)
1162{
c5441932 1163 struct ip_tunnel *tunnel = netdev_priv(dev);
6c742e71 1164 struct net *net = tunnel->net;
c5441932
PS
1165 struct ip_tunnel_net *itn;
1166
1167 itn = net_generic(net, tunnel->ip_tnl_net_id);
1168 /* fb_tunnel_dev will be unregisted in net-exit call. */
1169 if (itn->fb_tunnel_dev != dev)
1170 ip_tunnel_del(netdev_priv(dev));
7d442fab 1171
cf71d2bc 1172 ip_tunnel_dst_reset_all(tunnel);
c5441932
PS
1173}
1174EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1175
1176/* Do least required initialization, rest of init is done in tunnel_init call */
1177void ip_tunnel_setup(struct net_device *dev, int net_id)
1178{
1179 struct ip_tunnel *tunnel = netdev_priv(dev);
1180 tunnel->ip_tnl_net_id = net_id;
1181}
1182EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1183
1184MODULE_LICENSE("GPL");