tunnels: advertise link netns via netlink
[linux-2.6-block.git] / drivers / net / vxlan.c
1 /*
2  * VXLAN: Virtual eXtensible Local Area Network
3  *
4  * Copyright (c) 2012-2013 Vyatta Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13 #include <linux/kernel.h>
14 #include <linux/types.h>
15 #include <linux/module.h>
16 #include <linux/errno.h>
17 #include <linux/slab.h>
18 #include <linux/skbuff.h>
19 #include <linux/rculist.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/ip.h>
23 #include <linux/udp.h>
24 #include <linux/igmp.h>
25 #include <linux/etherdevice.h>
26 #include <linux/if_ether.h>
27 #include <linux/if_vlan.h>
28 #include <linux/hash.h>
29 #include <linux/ethtool.h>
30 #include <net/arp.h>
31 #include <net/ndisc.h>
32 #include <net/ip.h>
33 #include <net/ip_tunnels.h>
34 #include <net/icmp.h>
35 #include <net/udp.h>
36 #include <net/udp_tunnel.h>
37 #include <net/rtnetlink.h>
38 #include <net/route.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/net_namespace.h>
42 #include <net/netns/generic.h>
43 #include <net/vxlan.h>
44 #include <net/protocol.h>
45 #include <net/udp_tunnel.h>
46 #if IS_ENABLED(CONFIG_IPV6)
47 #include <net/ipv6.h>
48 #include <net/addrconf.h>
49 #include <net/ip6_tunnel.h>
50 #include <net/ip6_checksum.h>
51 #endif
52
53 #define VXLAN_VERSION   "0.1"
54
55 #define PORT_HASH_BITS  8
56 #define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
57 #define VNI_HASH_BITS   10
58 #define VNI_HASH_SIZE   (1<<VNI_HASH_BITS)
59 #define FDB_HASH_BITS   8
60 #define FDB_HASH_SIZE   (1<<FDB_HASH_BITS)
61 #define FDB_AGE_DEFAULT 300 /* 5 min */
62 #define FDB_AGE_INTERVAL (10 * HZ)      /* rescan interval */
63
64 /* UDP port for VXLAN traffic.
65  * The IANA assigned port is 4789, but the Linux default is 8472
66  * for compatibility with early adopters.
67  */
68 static unsigned short vxlan_port __read_mostly = 8472;
69 module_param_named(udp_port, vxlan_port, ushort, 0444);
70 MODULE_PARM_DESC(udp_port, "Destination UDP port");
71
72 static bool log_ecn_error = true;
73 module_param(log_ecn_error, bool, 0644);
74 MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
75
76 static int vxlan_net_id;
77
78 static const u8 all_zeros_mac[ETH_ALEN];
79
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;			/* all vxlan devices in this netns */
	struct hlist_head sock_list[PORT_HASH_SIZE];	/* UDP sockets, hashed by source port */
	spinlock_t        sock_lock;			/* protects sock_list */
};
86
/* Family-agnostic remote/source address; sa.sa_family selects the view. */
union vxlan_addr {
	struct sockaddr_in sin;		/* AF_INET view */
	struct sockaddr_in6 sin6;	/* AF_INET6 view */
	struct sockaddr sa;		/* generic view, used for sa_family */
};
92
/* One remote destination of a forwarding-table entry. */
struct vxlan_rdst {
	union vxlan_addr	 remote_ip;	/* remote endpoint address */
	__be16			 remote_port;	/* remote UDP port (network order) */
	u32			 remote_vni;	/* VXLAN network identifier */
	u32			 remote_ifindex; /* optional egress device */
	struct list_head	 list;		/* link in vxlan_fdb::remotes */
	struct rcu_head		 rcu;		/* deferred free */
};
101
/* Forwarding table entry: one MAC mapped to one or more remotes. */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head   rcu;		/* deferred free via vxlan_fdb_free() */
	unsigned long     updated;	/* jiffies */
	unsigned long     used;		/* jiffies of last lookup hit */
	struct list_head  remotes;	/* list of vxlan_rdst, never empty */
	u16               state;	/* see ndm_state */
	u8                flags;	/* see ndm_flags */
	u8                eth_addr[ETH_ALEN];	/* MAC key of this entry */
};
113
/* Pseudo network device: per-interface state for one VXLAN tunnel. */
struct vxlan_dev {
	struct hlist_node hlist;	/* vni hash table */
	struct list_head  next;		/* vxlan's per namespace list */
	struct vxlan_sock *vn_sock;	/* listening socket */
	struct net_device *dev;		/* the netdev we are private data of */
	struct net        *net;		/* netns for packet i/o */
	struct vxlan_rdst default_dst;	/* default destination */
	union vxlan_addr  saddr;	/* source address */
	__be16            dst_port;	/* default destination UDP port */
	__u16             port_min;	/* source port range */
	__u16             port_max;
	__u8              tos;		/* TOS override */
	__u8              ttl;
	u32               flags;	/* VXLAN_F_* in vxlan.h */

	/* deferred work items (scheduled on vxlan_wq) */
	struct work_struct sock_work;
	struct work_struct igmp_join;
	struct work_struct igmp_leave;

	unsigned long     age_interval;	/* FDB aging period */
	struct timer_list age_timer;	/* periodic FDB rescan */
	spinlock_t        hash_lock;	/* protects fdb_head and addrcnt */
	unsigned int      addrcnt;	/* current number of FDB entries */
	unsigned int      addrmax;	/* limit on FDB entries, 0 = unlimited */

	struct hlist_head fdb_head[FDB_HASH_SIZE];	/* forwarding table */
};
142
143 /* salt for hash table */
144 static u32 vxlan_salt __read_mostly;
145 static struct workqueue_struct *vxlan_wq;
146
147 static void vxlan_sock_work(struct work_struct *work);
148
149 #if IS_ENABLED(CONFIG_IPV6)
150 static inline
151 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
152 {
153        if (a->sa.sa_family != b->sa.sa_family)
154                return false;
155        if (a->sa.sa_family == AF_INET6)
156                return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
157        else
158                return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
159 }
160
161 static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
162 {
163        if (ipa->sa.sa_family == AF_INET6)
164                return ipv6_addr_any(&ipa->sin6.sin6_addr);
165        else
166                return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
167 }
168
169 static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
170 {
171        if (ipa->sa.sa_family == AF_INET6)
172                return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
173        else
174                return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
175 }
176
177 static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
178 {
179        if (nla_len(nla) >= sizeof(struct in6_addr)) {
180                nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
181                ip->sa.sa_family = AF_INET6;
182                return 0;
183        } else if (nla_len(nla) >= sizeof(__be32)) {
184                ip->sin.sin_addr.s_addr = nla_get_be32(nla);
185                ip->sa.sa_family = AF_INET;
186                return 0;
187        } else {
188                return -EAFNOSUPPORT;
189        }
190 }
191
192 static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
193                              const union vxlan_addr *ip)
194 {
195        if (ip->sa.sa_family == AF_INET6)
196                return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
197        else
198                return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
199 }
200
201 #else /* !CONFIG_IPV6 */
202
/* IPv4-only build: compare the two IPv4 addresses directly. */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
208
/* IPv4-only build: true for the INADDR_ANY wildcard. */
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}
213
/* IPv4-only build: true for an IPv4 multicast group address. */
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
218
/* IPv4-only build: parse a netlink address attribute into *ip.
 * IPv6-sized payloads are rejected with -EAFNOSUPPORT since the
 * kernel was built without CONFIG_IPV6.
 */
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
		ip->sin.sin_addr.s_addr = nla_get_be32(nla);
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
}
231
/* IPv4-only build: emit the IPv4 address as a 32-bit attribute. */
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
			      const union vxlan_addr *ip)
{
	return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
}
237 #endif
238
/* Virtual Network hash table head: bucket for a VNI within a socket. */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
{
	return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
}
244
/* Socket hash table head: per-netns bucket keyed by UDP port. */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	/* hash on host-order port so the distribution is endian-neutral */
	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
252
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 * RCU variant: safe under rcu_read_lock().
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
260
/* As first_remote_rcu(), but for callers holding RTNL instead of RCU. */
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
265
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 * Caller must hold RCU read lock (or stronger); the bucket is walked
 * with hlist_for_each_entry_rcu().
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags)
{
	struct vxlan_sock *vs;
	/* only the unshareable feature bits take part in matching */
	u32 match_flags = flags & VXLAN_F_UNSHAREABLE;

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
		    inet_sk(vs->sock->sk)->sk.sk_family == family &&
		    (vs->flags & VXLAN_F_UNSHAREABLE) == match_flags)
			return vs;
	}
	return NULL;
}
283
284 static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
285 {
286         struct vxlan_dev *vxlan;
287
288         hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
289                 if (vxlan->default_dst.remote_vni == id)
290                         return vxlan;
291         }
292
293         return NULL;
294 }
295
296 /* Look up VNI in a per net namespace table */
297 static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
298                                         sa_family_t family, __be16 port,
299                                         u32 flags)
300 {
301         struct vxlan_sock *vs;
302
303         vs = vxlan_find_sock(net, family, port, flags);
304         if (!vs)
305                 return NULL;
306
307         return vxlan_vs_find_vni(vs, id);
308 }
309
/* Fill in neighbour message in skbuff.
 * Builds one ndmsg + attributes describing (fdb, rdst); returns 0 or
 * -EMSGSIZE if skb has no room (the partial message is cancelled).
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;
	bool send_ip, send_eth;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	send_eth = send_ip = true;

	if (type == RTM_GETNEIGH) {
		/* miss notification: suppress attributes left zeroed in
		 * the synthetic entry (see vxlan_ip_miss/vxlan_fdb_miss)
		 */
		ndm->ndm_family	= AF_INET;
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else
		ndm->ndm_family	= AF_BRIDGE;
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = RTN_UNICAST;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	/* only emit port/vni/ifindex when they differ from device defaults */
	if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
		goto nla_put_failure;
	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
	    nla_put_u32(skb, NDA_VNI, rdst->remote_vni))
		goto nla_put_failure;
	if (rdst->remote_ifindex &&
	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
373
/* Worst-case size of one FDB netlink message; must cover every
 * attribute vxlan_fdb_info() can emit (NDA_DST sized for IPv6).
 */
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
384
/* Notify RTNLGRP_NEIGH listeners about an FDB change.
 * @type: RTM_NEWNEIGH / RTM_DELNEIGH / RTM_GETNEIGH (miss).
 * May be called in atomic context (uses GFP_ATOMIC throughout).
 */
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
410
/* Emit an L3 miss notification (RTM_GETNEIGH) for an unknown remote IP,
 * using a synthetic stale fdb entry whose MAC is all-zero so that
 * vxlan_fdb_info() only reports the address.
 */
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
		.remote_ip = *ipa, /* goes to NDA_DST */
		.remote_vni = VXLAN_N_VID,
	};

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
424
/* Emit an L2 miss notification (RTM_GETNEIGH) for an unknown MAC,
 * using a synthetic stale entry with an empty remote so only the
 * MAC address is reported.
 */
static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = { };

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}
436
/* Hash Ethernet address into an FDB bucket index.
 * NOTE(review): loads 8 bytes starting at the 6-byte MAC and shifts
 * away the extra 16 bits; this reads 2 bytes past the address itself,
 * presumably safe for the surrounding allocations — confirm callers.
 */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}
450
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac)
{
	return &vxlan->fdb_head[eth_hash(mac)];
}
457
458 /* Look up Ethernet address in forwarding table */
459 static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
460                                         const u8 *mac)
461 {
462         struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
463         struct vxlan_fdb *f;
464
465         hlist_for_each_entry_rcu(f, head, hlist) {
466                 if (ether_addr_equal(mac, f->eth_addr))
467                         return f;
468         }
469
470         return NULL;
471 }
472
473 static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
474                                         const u8 *mac)
475 {
476         struct vxlan_fdb *f;
477
478         f = __vxlan_find_mac(vxlan, mac);
479         if (f)
480                 f->used = jiffies;
481
482         return f;
483 }
484
485 /* caller should hold vxlan->hash_lock */
486 static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
487                                               union vxlan_addr *ip, __be16 port,
488                                               __u32 vni, __u32 ifindex)
489 {
490         struct vxlan_rdst *rd;
491
492         list_for_each_entry(rd, &f->remotes, list) {
493                 if (vxlan_addr_equal(&rd->remote_ip, ip) &&
494                     rd->remote_port == port &&
495                     rd->remote_vni == vni &&
496                     rd->remote_ifindex == ifindex)
497                         return rd;
498         }
499
500         return NULL;
501 }
502
/* Replace destination of unicast mac.
 * Returns 1 when the first remote was rewritten, 0 when the requested
 * destination already exists (or the entry has no remotes).
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;	/* nothing to do, already present */

	/* unicast entries carry a single remote: overwrite it in place */
	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	return 1;
}
522
/* Add/update destinations for multicast.
 * Appends a new remote to @f unless an identical one exists.
 * Returns 1 and stores the new remote in *rdp on append, 0 when the
 * destination was already listed, -ENOBUFS on allocation failure.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __u32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	/* RCU-safe append: readers may be walking f->remotes */
	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}
547
/* Handle VXLAN remote checksum offload (RCO) during GRO.
 * @data encodes the checksum start/offset taken from the VNI field.
 * Patches the inner transport checksum and adjusts the GRO checksums
 * accordingly. Returns the (possibly re-fetched) vxlan header, or
 * NULL when the packet cannot be validated/pulled.
 */
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
					  u32 data)
{
	size_t start, offset, plen;
	__wsum delta;

	/* already handled earlier in the stack for this skb */
	if (skb->remcsum_offload)
		return vh;

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

	start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
	/* RCO bit selects whether the checksum field is UDP's or TCP's */
	offset = start + ((data & VXLAN_RCO_UDP) ?
			  offsetof(struct udphdr, check) :
			  offsetof(struct tcphdr, check));

	plen = hdrlen + offset + sizeof(u16);

	/* Pull checksum that will be written */
	if (skb_gro_header_hard(skb, off + plen)) {
		vh = skb_gro_header_slow(skb, off + plen, off);
		if (!vh)
			return NULL;
	}

	delta = remcsum_adjust((void *)vh + hdrlen,
			       NAPI_GRO_CB(skb)->csum, start, offset);

	/* Adjust skb->csum since we changed the packet */
	skb->csum = csum_add(skb->csum, delta);
	NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);

	skb->remcsum_offload = 1;

	return vh;
}
587
/* GRO receive handler for the VXLAN UDP tunnel.
 * Pulls the vxlan header, optionally undoes remote checksum offload,
 * marks held packets with a different (vni, flags) pair as not
 * same-flow, then hands off to the inner Ethernet GRO layer.
 */
static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
					  struct sk_buff *skb,
					  struct udp_offload *uoff)
{
	struct sk_buff *p, **pp = NULL;
	struct vxlanhdr *vh, *vh2;
	unsigned int hlen, off_vx;
	int flush = 1;
	struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
					     udp_offloads);
	u32 flags;

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

	flags = ntohl(vh->vx_flags);

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
				       ntohl(vh->vx_vni));

		if (!vh)
			goto out;
	}

	flush = 0;

	/* packets on the GRO list only aggregate within one tunnel flow */
	for (p = *head; p; p = p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	pp = eth_gro_receive(head, skb);

out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}
643
/* GRO complete: fix up the outer UDP tunnel metadata, then finish the
 * inner Ethernet frame that starts after the vxlan header.
 */
static int vxlan_gro_complete(struct sk_buff *skb, int nhoff,
			      struct udp_offload *uoff)
{
	udp_tunnel_gro_complete(skb, nhoff);

	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}
651
/* Notify netdevs that UDP port started listening.
 * Registers GRO offload for IPv4 sockets and calls each device's
 * ndo_add_vxlan_port hook so NICs can program receive filters.
 */
static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
{
	struct net_device *dev;
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	sa_family_t sa_family = sk->sk_family;
	__be16 port = inet_sk(sk)->inet_sport;
	int err;

	if (sa_family == AF_INET) {
		err = udp_add_offload(&vs->udp_offloads);
		if (err)
			/* non-fatal: traffic still works without GRO */
			pr_warn("vxlan: udp_add_offload failed with status %d\n", err);
	}

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->netdev_ops->ndo_add_vxlan_port)
			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
							    port);
	}
	rcu_read_unlock();
}
676
/* Notify netdevs that UDP port is no more listening.
 * Mirror of vxlan_notify_add_rx_port(): device hooks first, then the
 * GRO offload is unregistered for IPv4 sockets.
 */
static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
{
	struct net_device *dev;
	struct sock *sk = vs->sock->sk;
	struct net *net = sock_net(sk);
	sa_family_t sa_family = sk->sk_family;
	__be16 port = inet_sk(sk)->inet_sport;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->netdev_ops->ndo_del_vxlan_port)
			dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family,
							    port);
	}
	rcu_read_unlock();

	if (sa_family == AF_INET)
		udp_del_offload(&vs->udp_offloads);
}
697
/* Add new entry to forwarding table -- assumes lock held.
 * Creates an entry for @mac or updates an existing one according to
 * the NLM_F_* bits in @flags. Sends RTM_NEWNEIGH when anything
 * changed. Caller holds vxlan->hash_lock.
 */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __u32 vni, __u32 ifindex,
			    __u8 ndm_flags)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int notify = 0;

	f = __vxlan_find_mac(vxlan, mac);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		/* refresh metadata of the existing entry */
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != ndm_flags) {
			f->flags = ndm_flags;
			f->updated = jiffies;
			notify = 1;
		}
		if ((flags & NLM_F_REPLACE)) {
			/* Only change unicasts */
			if (!(is_multicast_ether_addr(f->eth_addr) ||
			     is_zero_ether_addr(f->eth_addr))) {
				int rc = vxlan_fdb_replace(f, ip, port, vni,
							   ifindex);

				if (rc < 0)
					return rc;
				notify |= rc;
			} else
				return -EOPNOTSUPP;
		}
		/* multicast/all-zero entries may accumulate destinations */
		if ((flags & NLM_F_APPEND) &&
		    (is_multicast_ether_addr(f->eth_addr) ||
		     is_zero_ether_addr(f->eth_addr))) {
			int rc = vxlan_fdb_append(f, ip, port, vni, ifindex,
						  &rd);

			if (rc < 0)
				return rc;
			notify |= rc;
		}
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		/* enforce optional per-device entry limit */
		if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
			return -ENOSPC;

		/* Disallow replace to add a multicast entry */
		if ((flags & NLM_F_REPLACE) &&
		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
			return -EOPNOTSUPP;

		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
		f = kmalloc(sizeof(*f), GFP_ATOMIC);
		if (!f)
			return -ENOMEM;

		notify = 1;
		f->state = state;
		f->flags = ndm_flags;
		f->updated = f->used = jiffies;
		INIT_LIST_HEAD(&f->remotes);
		memcpy(f->eth_addr, mac, ETH_ALEN);

		vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

		++vxlan->addrcnt;
		hlist_add_head_rcu(&f->hlist,
				   vxlan_fdb_head(vxlan, mac));
	}

	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
		vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
	}

	return 0;
}
788
789 static void vxlan_fdb_free(struct rcu_head *head)
790 {
791         struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
792         struct vxlan_rdst *rd, *nd;
793
794         list_for_each_entry_safe(rd, nd, &f->remotes, list)
795                 kfree(rd);
796         kfree(f);
797 }
798
/* Remove an FDB entry: notify listeners, unlink it RCU-safely and
 * schedule the actual free. Caller holds vxlan->hash_lock.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	netdev_dbg(vxlan->dev,
		    "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}
810
/* Parse the NDA_* attributes of an FDB netlink request into discrete
 * values, falling back to the device defaults for anything omitted.
 * Returns 0 or a negative errno on malformed/unsatisfiable input.
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		/* no destination given: use the wildcard address of the
		 * same family as the device's default remote
		 */
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;
		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = nla_get_u32(tb[NDA_VNI]);
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		/* reject ifindexes that do not exist in this netns */
		tdev = __dev_get_by_index(net, *ifindex);
		if (!tdev)
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	return 0;
}
865
866 /* Add static entry (via netlink) */
867 static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
868                          struct net_device *dev,
869                          const unsigned char *addr, u16 vid, u16 flags)
870 {
871         struct vxlan_dev *vxlan = netdev_priv(dev);
872         /* struct net *net = dev_net(vxlan->dev); */
873         union vxlan_addr ip;
874         __be16 port;
875         u32 vni, ifindex;
876         int err;
877
878         if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
879                 pr_info("RTM_NEWNEIGH with invalid state %#x\n",
880                         ndm->ndm_state);
881                 return -EINVAL;
882         }
883
884         if (tb[NDA_DST] == NULL)
885                 return -EINVAL;
886
887         err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
888         if (err)
889                 return err;
890
891         if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
892                 return -EAFNOSUPPORT;
893
894         spin_lock_bh(&vxlan->hash_lock);
895         err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
896                                port, vni, ifindex, ndm->ndm_flags);
897         spin_unlock_bh(&vxlan->hash_lock);
898
899         return err;
900 }
901
/* Delete entry (via netlink).
 * If a specific remote IP was supplied, only that destination is removed
 * from the entry's remote list; the whole fdb entry is destroyed when the
 * last (or no specific) destination is targeted.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr, u16 vid)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb *f;
        struct vxlan_rdst *rd = NULL;
        union vxlan_addr ip;
        __be16 port;
        u32 vni, ifindex;
        int err;

        err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex);
        if (err)
                return err;

        err = -ENOENT;

        spin_lock_bh(&vxlan->hash_lock);
        f = vxlan_find_mac(vxlan, addr);
        if (!f)
                goto out;

        /* A specific remote was requested: it must exist on this entry */
        if (!vxlan_addr_any(&ip)) {
                rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
                if (!rd)
                        goto out;
        }

        err = 0;

        /* remove a destination if it's not the only one on the list,
         * otherwise destroy the fdb entry
         */
        if (rd && !list_is_singular(&f->remotes)) {
                /* RCU removal: readers may still see rd until grace period */
                list_del_rcu(&rd->list);
                vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH);
                kfree_rcu(rd, rcu);
                goto out;
        }

        vxlan_fdb_destroy(vxlan, f);

out:
        spin_unlock_bh(&vxlan->hash_lock);

        return err;
}
951
/* Dump forwarding table.
 * Emits one RTM_NEWNEIGH message per (fdb entry, remote destination) pair.
 * idx counts entries across the whole table so a multi-part dump can resume
 * where the previous netlink callback stopped (cb->args[0]).
 */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                          struct net_device *dev,
                          struct net_device *filter_dev, int idx)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        unsigned int h;

        for (h = 0; h < FDB_HASH_SIZE; ++h) {
                struct vxlan_fdb *f;
                int err;

                hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
                        struct vxlan_rdst *rd;

                        /* Skip entries already dumped in a previous call */
                        if (idx < cb->args[0])
                                goto skip;

                        list_for_each_entry_rcu(rd, &f->remotes, list) {
                                err = vxlan_fdb_info(skb, vxlan, f,
                                                     NETLINK_CB(cb->skb).portid,
                                                     cb->nlh->nlmsg_seq,
                                                     RTM_NEWNEIGH,
                                                     NLM_F_MULTI, rd);
                                /* skb full: stop, return progress so far */
                                if (err < 0)
                                        goto out;
                        }
skip:
                        ++idx;
                }
        }
out:
        return idx;
}
986
987 /* Watch incoming packets to learn mapping between Ethernet address
988  * and Tunnel endpoint.
989  * Return true if packet is bogus and should be droppped.
990  */
991 static bool vxlan_snoop(struct net_device *dev,
992                         union vxlan_addr *src_ip, const u8 *src_mac)
993 {
994         struct vxlan_dev *vxlan = netdev_priv(dev);
995         struct vxlan_fdb *f;
996
997         f = vxlan_find_mac(vxlan, src_mac);
998         if (likely(f)) {
999                 struct vxlan_rdst *rdst = first_remote_rcu(f);
1000
1001                 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
1002                         return false;
1003
1004                 /* Don't migrate static entries, drop packets */
1005                 if (f->state & NUD_NOARP)
1006                         return true;
1007
1008                 if (net_ratelimit())
1009                         netdev_info(dev,
1010                                     "%pM migrated from %pIS to %pIS\n",
1011                                     src_mac, &rdst->remote_ip, &src_ip);
1012
1013                 rdst->remote_ip = *src_ip;
1014                 f->updated = jiffies;
1015                 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
1016         } else {
1017                 /* learned new entry */
1018                 spin_lock(&vxlan->hash_lock);
1019
1020                 /* close off race between vxlan_flush and incoming packets */
1021                 if (netif_running(dev))
1022                         vxlan_fdb_create(vxlan, src_mac, src_ip,
1023                                          NUD_REACHABLE,
1024                                          NLM_F_EXCL|NLM_F_CREATE,
1025                                          vxlan->dst_port,
1026                                          vxlan->default_dst.remote_vni,
1027                                          0, NTF_SELF);
1028                 spin_unlock(&vxlan->hash_lock);
1029         }
1030
1031         return false;
1032 }
1033
1034 /* See if multicast group is already in use by other ID */
1035 static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
1036 {
1037         struct vxlan_dev *vxlan;
1038
1039         /* The vxlan_sock is only used by dev, leaving group has
1040          * no effect on other vxlan devices.
1041          */
1042         if (atomic_read(&dev->vn_sock->refcnt) == 1)
1043                 return false;
1044
1045         list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1046                 if (!netif_running(vxlan->dev) || vxlan == dev)
1047                         continue;
1048
1049                 if (vxlan->vn_sock != dev->vn_sock)
1050                         continue;
1051
1052                 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
1053                                       &dev->default_dst.remote_ip))
1054                         continue;
1055
1056                 if (vxlan->default_dst.remote_ifindex !=
1057                     dev->default_dst.remote_ifindex)
1058                         continue;
1059
1060                 return true;
1061         }
1062
1063         return false;
1064 }
1065
/* Take a reference on a vxlan socket; paired with vxlan_sock_release() */
static void vxlan_sock_hold(struct vxlan_sock *vs)
{
        atomic_inc(&vs->refcnt);
}
1070
/* Drop a reference on a vxlan socket.  On the last put, unhash the
 * socket (so new users can no longer find it) and defer the actual
 * teardown to a workqueue, since destroying the socket may sleep.
 */
void vxlan_sock_release(struct vxlan_sock *vs)
{
        struct sock *sk = vs->sock->sk;
        struct net *net = sock_net(sk);
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);

        if (!atomic_dec_and_test(&vs->refcnt))
                return;

        /* Last reference: remove from the per-netns hash under sock_lock
         * so concurrent lookups stop seeing this socket.
         */
        spin_lock(&vn->sock_lock);
        hlist_del_rcu(&vs->hlist);
        vxlan_notify_del_rx_port(vs);
        spin_unlock(&vn->sock_lock);

        queue_work(vxlan_wq, &vs->del_work);
}
EXPORT_SYMBOL_GPL(vxlan_sock_release);
1088
/* Callback to update multicast group membership when first VNI on
 * multicast address is brought up
 * Done as workqueue because ip_mc_join_group acquires RTNL.
 */
static void vxlan_igmp_join(struct work_struct *work)
{
        struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;

        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                struct ip_mreqn mreq = {
                        .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
                        .imr_ifindex            = ifindex,
                };

                ip_mc_join_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                /* IPv6 join goes through the ipv6 stub so this module
                 * links even when the ipv6 module is not present.
                 */
                ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
                                             &ip->sin6.sin6_addr);
#endif
        }
        release_sock(sk);

        /* Drop the socket and device references held for this work item
         * (presumably taken when the work was queued — see scheduler).
         */
        vxlan_sock_release(vs);
        dev_put(vxlan->dev);
}
1120
/* Inverse of vxlan_igmp_join when last VNI is brought down */
static void vxlan_igmp_leave(struct work_struct *work)
{
        struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;

        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                struct ip_mreqn mreq = {
                        .imr_multiaddr.s_addr   = ip->sin.sin_addr.s_addr,
                        .imr_ifindex            = ifindex,
                };

                ip_mc_leave_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                /* IPv6 leave via the ipv6 stub, mirroring the join path */
                ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
                                             &ip->sin6.sin6_addr);
#endif
        }

        release_sock(sk);

        /* Drop the references held while the work item was pending */
        vxlan_sock_release(vs);
        dev_put(vxlan->dev);
}
1150
/* Undo remote checksum offload (RCO): the sender encoded the inner
 * checksum start and UDP-vs-TCP selector in the low bits of the VNI
 * field (data); recompute the affected checksum here on receive.
 * Returns the (possibly re-read) vxlan header, or NULL if the needed
 * bytes cannot be pulled into the linear area.
 */
static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
                                      size_t hdrlen, u32 data)
{
        size_t start, offset, plen;
        __wsum delta;

        if (skb->remcsum_offload) {
                /* Already processed in GRO path */
                skb->remcsum_offload = 0;
                return vh;
        }

        /* Decode checksum start, then pick the UDP or TCP check field */
        start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
        offset = start + ((data & VXLAN_RCO_UDP) ?
                          offsetof(struct udphdr, check) :
                          offsetof(struct tcphdr, check));

        plen = hdrlen + offset + sizeof(u16);

        if (!pskb_may_pull(skb, plen))
                return NULL;

        /* pskb_may_pull may have reallocated head; re-derive vh */
        vh = (struct vxlanhdr *)(udp_hdr(skb) + 1);

        if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE))
                __skb_checksum_complete(skb);

        delta = remcsum_adjust((void *)vh + hdrlen,
                               skb->csum, start, offset);

        /* Adjust skb->csum since we changed the packet */
        skb->csum = csum_add(skb->csum, delta);

        return vh;
}
1186
/* Callback from net/ipv4/udp.c to receive packets.
 * Validates the VXLAN header, strips recognized extension bits
 * (RCO, GBP) when the socket enabled them, and hands the inner frame
 * to the socket's rcv handler.  Returns 0 when the packet was consumed,
 * 1 to tell the UDP layer this was not a vxlan packet.
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
        struct vxlan_sock *vs;
        struct vxlanhdr *vxh;
        u32 flags, vni;
        struct vxlan_metadata md = {0};

        /* Need Vxlan and inner Ethernet header to be present */
        if (!pskb_may_pull(skb, VXLAN_HLEN))
                goto error;

        vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
        flags = ntohl(vxh->vx_flags);
        vni = ntohl(vxh->vx_vni);

        if (flags & VXLAN_HF_VNI) {
                flags &= ~VXLAN_HF_VNI;
        } else {
                /* VNI flag always required to be set */
                goto bad_flags;
        }

        if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
                goto drop;
        /* pull may have reallocated; refresh the header pointer */
        vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);

        vs = rcu_dereference_sk_user_data(sk);
        if (!vs)
                goto drop;

        /* Remote checksum offload: only honored if enabled on this socket */
        if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
                vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni);
                if (!vxh)
                        goto drop;

                flags &= ~VXLAN_HF_RCO;
                vni &= VXLAN_VID_MASK;
        }

        /* For backwards compatibility, only allow reserved fields to be
         * used by VXLAN extensions if explicitly requested.
         */
        if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
                struct vxlanhdr_gbp *gbp;

                gbp = (struct vxlanhdr_gbp *)vxh;
                md.gbp = ntohs(gbp->policy_id);

                if (gbp->dont_learn)
                        md.gbp |= VXLAN_GBP_DONT_LEARN;

                if (gbp->policy_applied)
                        md.gbp |= VXLAN_GBP_POLICY_APPLIED;

                flags &= ~VXLAN_GBP_USED_BITS;
        }

        if (flags || (vni & ~VXLAN_VID_MASK)) {
                /* If there are any unprocessed flags remaining treat
                 * this as a malformed packet. This behavior diverges from
                 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
                 * in reserved fields are to be ignored. The approach here
                 * maintains compatbility with previous stack code, and also
                 * is more robust and provides a little more security in
                 * adding extensions to VXLAN.
                 */

                goto bad_flags;
        }

        md.vni = vxh->vx_vni;
        vs->rcv(vs, skb, &md);
        return 0;

drop:
        /* Consume bad packet */
        kfree_skb(skb);
        return 0;

bad_flags:
        netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
                   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));

error:
        /* Return non vxlan pkt */
        return 1;
}
1275
/* Deliver a decapsulated frame to the vxlan device matching its VNI:
 * optionally learn the source endpoint, decapsulate ECN, update stats
 * and inject the inner packet into the network stack.
 */
static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
                      struct vxlan_metadata *md)
{
        struct iphdr *oip = NULL;
        struct ipv6hdr *oip6 = NULL;
        struct vxlan_dev *vxlan;
        struct pcpu_sw_netstats *stats;
        union vxlan_addr saddr;
        __u32 vni;
        int err = 0;
        union vxlan_addr *remote_ip;

        /* VNI occupies the upper 24 bits of the network-order field */
        vni = ntohl(md->vni) >> 8;
        /* Is this VNI defined? */
        vxlan = vxlan_vs_find_vni(vs, vni);
        if (!vxlan)
                goto drop;

        remote_ip = &vxlan->default_dst.remote_ip;
        skb_reset_mac_header(skb);
        /* Scrub state if the packet crossed a netns boundary */
        skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
        skb->protocol = eth_type_trans(skb, vxlan->dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

        /* Ignore packet loops (and multicast echo) */
        if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
                goto drop;

        /* Re-examine inner Ethernet packet */
        if (remote_ip->sa.sa_family == AF_INET) {
                oip = ip_hdr(skb);
                saddr.sin.sin_addr.s_addr = oip->saddr;
                saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                oip6 = ipv6_hdr(skb);
                saddr.sin6.sin6_addr = oip6->saddr;
                saddr.sa.sa_family = AF_INET6;
#endif
        }

        /* Learn source MAC -> outer IP mapping; drop if snoop says bogus */
        if ((vxlan->flags & VXLAN_F_LEARN) &&
            vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
                goto drop;

        skb_reset_network_header(skb);
        /* Propagate GBP policy id via skb mark */
        skb->mark = md->gbp;

        if (oip6)
                err = IP6_ECN_decapsulate(oip6, skb);
        if (oip)
                err = IP_ECN_decapsulate(oip, skb);

        if (unlikely(err)) {
                if (log_ecn_error) {
                        if (oip6)
                                net_info_ratelimited("non-ECT from %pI6\n",
                                                     &oip6->saddr);
                        if (oip)
                                net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                                     &oip->saddr, oip->tos);
                }
                /* err > 1 means the frame must be dropped, not just logged */
                if (err > 1) {
                        ++vxlan->dev->stats.rx_frame_errors;
                        ++vxlan->dev->stats.rx_errors;
                        goto drop;
                }
        }

        stats = this_cpu_ptr(vxlan->dev->tstats);
        u64_stats_update_begin(&stats->syncp);
        stats->rx_packets++;
        stats->rx_bytes += skb->len;
        u64_stats_update_end(&stats->syncp);

        netif_rx(skb);

        return;
drop:
        /* Consume bad packet */
        kfree_skb(skb);
}
1358
/* Proxy-ARP: answer ARP requests locally from the FDB/neighbour cache
 * instead of flooding them over the tunnel.  Falls through to an L3
 * miss notification when the target is unknown and L3MISS is enabled.
 * Always consumes the skb and returns NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct arphdr *parp;
        u8 *arpptr, *sha;
        __be32 sip, tip;
        struct neighbour *n;

        if (dev->flags & IFF_NOARP)
                goto out;

        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
                dev->stats.tx_dropped++;
                goto out;
        }
        parp = arp_hdr(skb);

        /* Only handle well-formed Ethernet/IPv4 ARP requests */
        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
            parp->ar_pro != htons(ETH_P_IP) ||
            parp->ar_op != htons(ARPOP_REQUEST) ||
            parp->ar_hln != dev->addr_len ||
            parp->ar_pln != 4)
                goto out;
        /* Walk the variable-length ARP payload: sha, sip, tha, tip */
        arpptr = (u8 *)parp + sizeof(struct arphdr);
        sha = arpptr;
        arpptr += dev->addr_len;        /* sha */
        memcpy(&sip, arpptr, sizeof(sip));
        arpptr += sizeof(sip);
        arpptr += dev->addr_len;        /* tha */
        memcpy(&tip, arpptr, sizeof(tip));

        if (ipv4_is_loopback(tip) ||
            ipv4_is_multicast(tip))
                goto out;

        n = neigh_lookup(&arp_tbl, &tip, dev);

        if (n) {
                struct vxlan_fdb *f;
                struct sk_buff  *reply;

                if (!(n->nud_state & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                f = vxlan_find_mac(vxlan, n->ha);
                if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                /* Build the ARP reply on behalf of the remote host */
                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
                                n->ha, sha);

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                skb_reset_mac_header(reply);
                __skb_pull(reply, skb_network_offset(reply));
                reply->ip_summed = CHECKSUM_UNNECESSARY;
                reply->pkt_type = PACKET_HOST;

                /* Inject the reply as if it arrived on this device */
                if (netif_rx_ni(reply) == NET_RX_DROP)
                        dev->stats.rx_dropped++;
        } else if (vxlan->flags & VXLAN_F_L3MISS) {
                union vxlan_addr ipa = {
                        .sin.sin_addr.s_addr = tip,
                        .sin.sin_family = AF_INET,
                };

                vxlan_ip_miss(dev, &ipa);
        }
out:
        consume_skb(skb);
        return NETDEV_TX_OK;
}
1440
1441 #if IS_ENABLED(CONFIG_IPV6)
/* Build an IPv6 Neighbour Advertisement in reply to the Neighbour
 * Solicitation in @request, answering on behalf of neighbour @n.
 * Returns a freshly allocated skb ready for netif_rx, or NULL on
 * allocation failure / missing device.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
        struct neighbour *n, bool isrouter)
{
        struct net_device *dev = request->dev;
        struct sk_buff *reply;
        struct nd_msg *ns, *na;
        struct ipv6hdr *pip6;
        u8 *daddr;
        int na_olen = 8; /* opt hdr + ETH_ALEN for target */
        int ns_olen;
        int i, len;

        if (dev == NULL)
                return NULL;

        len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
                sizeof(*na) + na_olen + dev->needed_tailroom;
        reply = alloc_skb(len, GFP_ATOMIC);
        if (reply == NULL)
                return NULL;

        reply->protocol = htons(ETH_P_IPV6);
        reply->dev = dev;
        skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
        skb_push(reply, sizeof(struct ethhdr));
        skb_set_mac_header(reply, 0);

        ns = (struct nd_msg *)skb_transport_header(request);

        /* Default L2 destination: the solicitation's source MAC, unless a
         * source link-layer address option overrides it below.
         */
        daddr = eth_hdr(request)->h_source;
        ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns);
        for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
                if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
                        daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
                        break;
                }
        }

        /* Ethernet header */
        ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
        ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
        eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
        reply->protocol = htons(ETH_P_IPV6);

        skb_pull(reply, sizeof(struct ethhdr));
        skb_set_network_header(reply, 0);
        skb_put(reply, sizeof(struct ipv6hdr));

        /* IPv6 header */

        pip6 = ipv6_hdr(reply);
        memset(pip6, 0, sizeof(struct ipv6hdr));
        pip6->version = 6;
        pip6->priority = ipv6_hdr(request)->priority;
        pip6->nexthdr = IPPROTO_ICMPV6;
        pip6->hop_limit = 255;
        pip6->daddr = ipv6_hdr(request)->saddr;
        pip6->saddr = *(struct in6_addr *)n->primary_key;

        skb_pull(reply, sizeof(struct ipv6hdr));
        skb_set_transport_header(reply, 0);

        na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen);

        /* Neighbor Advertisement */
        memset(na, 0, sizeof(*na)+na_olen);
        na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
        na->icmph.icmp6_router = isrouter;
        na->icmph.icmp6_override = 1;
        na->icmph.icmp6_solicited = 1;
        na->target = ns->target;
        /* Target link-layer address option (type, len, then MAC) */
        ether_addr_copy(&na->opt[2], n->ha);
        na->opt[0] = ND_OPT_TARGET_LL_ADDR;
        na->opt[1] = na_olen >> 3;

        na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
                &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
                csum_partial(na, sizeof(*na)+na_olen, 0));

        pip6->payload_len = htons(sizeof(*na)+na_olen);

        skb_push(reply, sizeof(struct ipv6hdr));

        reply->ip_summed = CHECKSUM_UNNECESSARY;

        return reply;
}
1529
/* IPv6 counterpart of arp_reduce(): answer Neighbour Solicitations
 * locally from the neighbour cache rather than flooding them over the
 * tunnel; report an L3 miss when the target is unknown.
 * Always consumes the skb and returns NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct nd_msg *msg;
        const struct ipv6hdr *iphdr;
        const struct in6_addr *saddr, *daddr;
        struct neighbour *n;
        struct inet6_dev *in6_dev;

        in6_dev = __in6_dev_get(dev);
        if (!in6_dev)
                goto out;

        iphdr = ipv6_hdr(skb);
        saddr = &iphdr->saddr;
        daddr = &iphdr->daddr;

        /* Only plain Neighbour Solicitations are handled */
        msg = (struct nd_msg *)skb_transport_header(skb);
        if (msg->icmph.icmp6_code != 0 ||
            msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
                goto out;

        if (ipv6_addr_loopback(daddr) ||
            ipv6_addr_is_multicast(&msg->target))
                goto out;

        n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

        if (n) {
                struct vxlan_fdb *f;
                struct sk_buff *reply;

                if (!(n->nud_state & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                f = vxlan_find_mac(vxlan, n->ha);
                if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                /* Advertise router flag only if the FDB entry says so */
                reply = vxlan_na_create(skb, n,
                                        !!(f ? f->flags & NTF_ROUTER : 0));

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                if (netif_rx_ni(reply) == NET_RX_DROP)
                        dev->stats.rx_dropped++;

        } else if (vxlan->flags & VXLAN_F_L3MISS) {
                union vxlan_addr ipa = {
                        .sin6.sin6_addr = msg->target,
                        .sin6.sin6_family = AF_INET6,
                };

                vxlan_ip_miss(dev, &ipa);
        }

out:
        consume_skb(skb);
        return NETDEV_TX_OK;
}
1598 #endif
1599
/* L3 switching helper: if the destination IP resolves to a known
 * neighbour whose MAC differs from the frame's destination, rewrite
 * the Ethernet addresses so the frame is forwarded directly.
 * Returns true when the header was rewritten, false otherwise.
 */
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct neighbour *n;

        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        n = NULL;
        switch (ntohs(eth_hdr(skb)->h_proto)) {
        case ETH_P_IP:
        {
                struct iphdr *pip;

                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                        return false;
                pip = ip_hdr(skb);
                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
                /* Unknown destination: report an L3 miss if requested */
                if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
                        union vxlan_addr ipa = {
                                .sin.sin_addr.s_addr = pip->daddr,
                                .sin.sin_family = AF_INET,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case ETH_P_IPV6:
        {
                struct ipv6hdr *pip6;

                if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
                        return false;
                pip6 = ipv6_hdr(skb);
                n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
                if (!n && (vxlan->flags & VXLAN_F_L3MISS)) {
                        union vxlan_addr ipa = {
                                .sin6.sin6_addr = pip6->daddr,
                                .sin6.sin6_family = AF_INET6,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#endif
        default:
                return false;
        }

        if (n) {
                bool diff;

                diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
                if (diff) {
                        /* Old destination becomes source; neighbour's MAC
                         * becomes the new destination.
                         */
                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
                                dev->addr_len);
                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
                }
                neigh_release(n);
                return diff;
        }

        return false;
}
1671
1672 static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, struct vxlan_sock *vs,
1673                                 struct vxlan_metadata *md)
1674 {
1675         struct vxlanhdr_gbp *gbp;
1676
1677         gbp = (struct vxlanhdr_gbp *)vxh;
1678         vxh->vx_flags |= htonl(VXLAN_HF_GBP);
1679
1680         if (md->gbp & VXLAN_GBP_DONT_LEARN)
1681                 gbp->dont_learn = 1;
1682
1683         if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
1684                 gbp->policy_applied = 1;
1685
1686         gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
1687 }
1688
1689 #if IS_ENABLED(CONFIG_IPV6)
/* Encapsulate an skb in a VXLAN-over-IPv6/UDP packet and transmit it.
 * Handles GSO offload setup, optional remote checksum offload and the
 * GBP extension.  Consumes skb on success; on error the dst reference
 * is released and a negative errno is returned.
 */
static int vxlan6_xmit_skb(struct vxlan_sock *vs,
                           struct dst_entry *dst, struct sk_buff *skb,
                           struct net_device *dev, struct in6_addr *saddr,
                           struct in6_addr *daddr, __u8 prio, __u8 ttl,
                           __be16 src_port, __be16 dst_port,
                           struct vxlan_metadata *md, bool xnet)
{
        struct vxlanhdr *vxh;
        int min_headroom;
        int err;
        bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk);
        int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
        u16 hdrlen = sizeof(struct vxlanhdr);

        /* Remote checksum offload is only usable when the checksum start
         * and offset fit the encoding in the VNI field.
         */
        if ((vs->flags & VXLAN_F_REMCSUM_TX) &&
            skb->ip_summed == CHECKSUM_PARTIAL) {
                int csum_start = skb_checksum_start_offset(skb);

                if (csum_start <= VXLAN_MAX_REMCSUM_START &&
                    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
                    (skb->csum_offset == offsetof(struct udphdr, check) ||
                     skb->csum_offset == offsetof(struct tcphdr, check))) {
                        udp_sum = false;
                        type |= SKB_GSO_TUNNEL_REMCSUM;
                }
        }

        skb = iptunnel_handle_offloads(skb, udp_sum, type);
        if (IS_ERR(skb)) {
                err = -EINVAL;
                goto err;
        }

        skb_scrub_packet(skb, xnet);

        min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
                        + VXLAN_HLEN + sizeof(struct ipv6hdr)
                        + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

        /* Need space for new headers (invalidates iph ptr) */
        err = skb_cow_head(skb, min_headroom);
        if (unlikely(err)) {
                kfree_skb(skb);
                goto err;
        }

        skb = vlan_hwaccel_push_inside(skb);
        if (WARN_ON(!skb)) {
                err = -ENOMEM;
                goto err;
        }

        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = htonl(VXLAN_HF_VNI);
        vxh->vx_vni = md->vni;

        if (type & SKB_GSO_TUNNEL_REMCSUM) {
                /* Encode checksum start/offset in the low VNI bits */
                u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
                           VXLAN_RCO_SHIFT;

                if (skb->csum_offset == offsetof(struct udphdr, check))
                        data |= VXLAN_RCO_UDP;

                vxh->vx_vni |= htonl(data);
                vxh->vx_flags |= htonl(VXLAN_HF_RCO);

                if (!skb_is_gso(skb)) {
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->encapsulation = 0;
                }
        }

        if (vs->flags & VXLAN_F_GBP)
                vxlan_build_gbp_hdr(vxh, vs, md);

        skb_set_inner_protocol(skb, htons(ETH_P_TEB));

        udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio,
                             ttl, src_port, dst_port);
        return 0;
err:
        dst_release(dst);
        return err;
}
1774 #endif
1775
/* Encapsulate @skb with a VXLAN header and transmit it as UDP/IPv4 over
 * the already-resolved route @rt.
 *
 * @vs:        transmitting VXLAN socket; vs->flags select the RCO/GBP
 *             header extensions
 * @src, @dst: outer IPv4 source/destination addresses
 * @tos, @ttl, @df: outer IP header fields
 * @src_port, @dst_port: outer UDP ports
 * @md:        per-packet metadata (VNI, GBP fields)
 * @xnet:      true when the packet crosses network namespaces
 *
 * Returns the value of udp_tunnel_xmit_skb() on success or a negative
 * errno; on every error path the skb has already been freed.
 */
int vxlan_xmit_skb(struct vxlan_sock *vs,
		   struct rtable *rt, struct sk_buff *skb,
		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
		   __be16 src_port, __be16 dst_port,
		   struct vxlan_metadata *md, bool xnet)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	bool udp_sum = !vs->sock->sk->sk_no_check_tx;
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
	u16 hdrlen = sizeof(struct vxlanhdr);

	if ((vs->flags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		/* Remote checksum offload is only usable when the checksum
		 * start fits in the RCO field, is aligned to the RCO shift
		 * granularity, and points at a TCP or UDP checksum; the
		 * outer UDP checksum is then skipped.
		 */
		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check))) {
			udp_sum = false;
			type |= SKB_GSO_TUNNEL_REMCSUM;
		}
	}

	skb = iptunnel_handle_offloads(skb, udp_sum, type);
	if (IS_ERR(skb))
		return PTR_ERR(skb);

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ VXLAN_HLEN + sizeof(struct iphdr)
			+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err)) {
		kfree_skb(skb);
		return err;
	}

	skb = vlan_hwaccel_push_inside(skb);
	if (WARN_ON(!skb))
		return -ENOMEM;

	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = htonl(VXLAN_HF_VNI);
	vxh->vx_vni = md->vni;

	if (type & SKB_GSO_TUNNEL_REMCSUM) {
		/* Encode the checksum start offset into the low VNI bits
		 * and flag UDP vs TCP via VXLAN_RCO_UDP.
		 */
		u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
			   VXLAN_RCO_SHIFT;

		if (skb->csum_offset == offsetof(struct udphdr, check))
			data |= VXLAN_RCO_UDP;

		vxh->vx_vni |= htonl(data);
		vxh->vx_flags |= htonl(VXLAN_HF_RCO);

		/* Non-GSO packets carry the checksum remotely from here on */
		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

	if (vs->flags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vs, md);

	skb_set_inner_protocol(skb, htons(ETH_P_TEB));

	return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos,
				   ttl, df, src_port, dst_port, xnet);
}
EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
1850
/* Bypass encapsulation if the destination is local: hand the frame
 * straight to the destination VXLAN device as if it had been received
 * from a loopback peer, updating tx stats on the source device and rx
 * stats on the destination.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev = skb->dev;
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	/* Fake a loopback source address matching the remote's family.
	 * NOTE(review): with CONFIG_IPV6 disabled, an AF_INET6 remote
	 * would leave 'loopback' uninitialized - presumably unreachable
	 * because IPv6 remotes cannot be configured then; confirm.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	if (dst_vxlan->flags & VXLAN_F_LEARN)
		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	/* rx side only counts frames netif_rx actually accepted */
	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
		dev->stats.rx_dropped++;
	}
}
1895
/* Encapsulate one frame and send it to the remote described by @rdst.
 *
 * Chooses IPv4 or IPv6 encapsulation from the remote address family,
 * resolves the route, short-circuits back to a local vxlan device when
 * the route is local, and accounts errors in dev->stats.  The skb is
 * consumed on every path.
 *
 * @did_rsc: true when route short-circuiting already rewrote the
 *           destination MAC (see route_shortcircuit in the caller).
 */
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
			   struct vxlan_rdst *rdst, bool did_rsc)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct rtable *rt = NULL;
	const struct iphdr *old_iph;
	struct flowi4 fl4;
	union vxlan_addr *dst;
	struct vxlan_metadata md;
	__be16 src_port = 0, dst_port;
	u32 vni;
	__be16 df = 0;
	__u8 tos, ttl;
	int err;

	/* Per-rdst port falls back to the device-wide default */
	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
	vni = rdst->remote_vni;
	dst = &rdst->remote_ip;

	if (vxlan_addr_any(dst)) {
		if (did_rsc) {
			/* short-circuited back to local bridge */
			vxlan_encap_bypass(skb, vxlan, vxlan);
			return;
		}
		goto drop;
	}

	old_iph = ip_hdr(skb);

	/* Keep multicast TTL at 1 unless explicitly configured */
	ttl = vxlan->ttl;
	if (!ttl && vxlan_addr_multicast(dst))
		ttl = 1;

	/* tos == 1 means "inherit" from the inner header */
	tos = vxlan->tos;
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min,
				     vxlan->port_max, true);

	if (dst->sa.sa_family == AF_INET) {
		memset(&fl4, 0, sizeof(fl4));
		fl4.flowi4_oif = rdst->remote_ifindex;
		fl4.flowi4_tos = RT_TOS(tos);
		fl4.daddr = dst->sin.sin_addr.s_addr;
		fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr;

		rt = ip_route_output_key(vxlan->net, &fl4);
		if (IS_ERR(rt)) {
			netdev_dbg(dev, "no route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n",
				   &dst->sin.sin_addr.s_addr);
			dev->stats.collisions++;
			goto rt_tx_error;
		}

		/* Bypass encapsulation if the destination is local */
		if (rt->rt_flags & RTCF_LOCAL &&
		    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			ip_rt_put(rt);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
		/* VNI occupies the upper 24 bits of the header field */
		md.vni = htonl(vni << 8);
		md.gbp = skb->mark;

		err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb,
				     fl4.saddr, dst->sin.sin_addr.s_addr,
				     tos, ttl, df, src_port, dst_port, &md,
				     !net_eq(vxlan->net, dev_net(vxlan->dev)));
		if (err < 0) {
			/* skb is already freed. */
			skb = NULL;
			goto rt_tx_error;
		}

		iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct sock *sk = vxlan->vn_sock->sock->sk;
		struct dst_entry *ndst;
		struct flowi6 fl6;
		u32 flags;

		memset(&fl6, 0, sizeof(fl6));
		fl6.flowi6_oif = rdst->remote_ifindex;
		fl6.daddr = dst->sin6.sin6_addr;
		fl6.saddr = vxlan->saddr.sin6.sin6_addr;
		fl6.flowi6_proto = IPPROTO_UDP;

		if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
			netdev_dbg(dev, "no route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}

		if (ndst->dev == dev) {
			netdev_dbg(dev, "circular route to %pI6\n",
				   &dst->sin6.sin6_addr);
			dst_release(ndst);
			dev->stats.collisions++;
			goto tx_error;
		}

		/* Bypass encapsulation if the destination is local.
		 * NOTE(review): rt6i_flags holds RTF_* bits but is masked
		 * here with the IPv4 RTCF_BROADCAST/RTCF_MULTICAST values -
		 * looks like the wrong flag namespace; confirm intent.
		 */
		flags = ((struct rt6_info *)ndst)->rt6i_flags;
		if (flags & RTF_LOCAL &&
		    !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
			struct vxlan_dev *dst_vxlan;

			dst_release(ndst);
			dst_vxlan = vxlan_find_vni(vxlan->net, vni,
						   dst->sa.sa_family, dst_port,
						   vxlan->flags);
			if (!dst_vxlan)
				goto tx_error;
			vxlan_encap_bypass(skb, vxlan, dst_vxlan);
			return;
		}

		ttl = ttl ? : ip6_dst_hoplimit(ndst);
		md.vni = htonl(vni << 8);
		md.gbp = skb->mark;

		err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb,
				      dev, &fl6.saddr, &fl6.daddr, 0, ttl,
				      src_port, dst_port, &md,
				      !net_eq(vxlan->net, dev_net(vxlan->dev)));
#endif
	}

	return;

drop:
	dev->stats.tx_dropped++;
	goto tx_free;

rt_tx_error:
	ip_rt_put(rt);
tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
}
2058
/* Transmit local packets over VXLAN
 *
 * The outer IP header inherits ECN and DF from the inner header.
 * The outer UDP destination is the VXLAN assigned port;
 * the outer UDP source port is based on a hash of the flow.
 */
2065 static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
2066 {
2067         struct vxlan_dev *vxlan = netdev_priv(dev);
2068         struct ethhdr *eth;
2069         bool did_rsc = false;
2070         struct vxlan_rdst *rdst, *fdst = NULL;
2071         struct vxlan_fdb *f;
2072
2073         skb_reset_mac_header(skb);
2074         eth = eth_hdr(skb);
2075
2076         if ((vxlan->flags & VXLAN_F_PROXY)) {
2077                 if (ntohs(eth->h_proto) == ETH_P_ARP)
2078                         return arp_reduce(dev, skb);
2079 #if IS_ENABLED(CONFIG_IPV6)
2080                 else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
2081                          pskb_may_pull(skb, sizeof(struct ipv6hdr)
2082                                        + sizeof(struct nd_msg)) &&
2083                          ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
2084                                 struct nd_msg *msg;
2085
2086                                 msg = (struct nd_msg *)skb_transport_header(skb);
2087                                 if (msg->icmph.icmp6_code == 0 &&
2088                                     msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2089                                         return neigh_reduce(dev, skb);
2090                 }
2091                 eth = eth_hdr(skb);
2092 #endif
2093         }
2094
2095         f = vxlan_find_mac(vxlan, eth->h_dest);
2096         did_rsc = false;
2097
2098         if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
2099             (ntohs(eth->h_proto) == ETH_P_IP ||
2100              ntohs(eth->h_proto) == ETH_P_IPV6)) {
2101                 did_rsc = route_shortcircuit(dev, skb);
2102                 if (did_rsc)
2103                         f = vxlan_find_mac(vxlan, eth->h_dest);
2104         }
2105
2106         if (f == NULL) {
2107                 f = vxlan_find_mac(vxlan, all_zeros_mac);
2108                 if (f == NULL) {
2109                         if ((vxlan->flags & VXLAN_F_L2MISS) &&
2110                             !is_multicast_ether_addr(eth->h_dest))
2111                                 vxlan_fdb_miss(vxlan, eth->h_dest);
2112
2113                         dev->stats.tx_dropped++;
2114                         kfree_skb(skb);
2115                         return NETDEV_TX_OK;
2116                 }
2117         }
2118
2119         list_for_each_entry_rcu(rdst, &f->remotes, list) {
2120                 struct sk_buff *skb1;
2121
2122                 if (!fdst) {
2123                         fdst = rdst;
2124                         continue;
2125                 }
2126                 skb1 = skb_clone(skb, GFP_ATOMIC);
2127                 if (skb1)
2128                         vxlan_xmit_one(skb1, dev, rdst, did_rsc);
2129         }
2130
2131         if (fdst)
2132                 vxlan_xmit_one(skb, dev, fdst, did_rsc);
2133         else
2134                 kfree_skb(skb);
2135         return NETDEV_TX_OK;
2136 }
2137
/* Walk the forwarding table and purge stale entries.
 *
 * Timer callback: deletes learned entries unused for longer than the
 * configured ageing interval and re-arms itself for the earliest
 * upcoming expiry (at most FDB_AGE_INTERVAL from now).
 */
static void vxlan_cleanup(unsigned long arg)
{
	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	/* Do not re-arm while the device is down */
	if (!netif_running(vxlan->dev))
		return;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			/* Static entries never age out */
			if (f->state & NUD_PERMANENT)
				continue;

			timeout = f->used + vxlan->age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f);
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);

	mod_timer(&vxlan->age_timer, next_timer);
}
2174
2175 static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
2176 {
2177         __u32 vni = vxlan->default_dst.remote_vni;
2178
2179         vxlan->vn_sock = vs;
2180         hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
2181 }
2182
/* Setup stats when device is created.
 *
 * ndo_init: allocate per-cpu stats and bind the device to a vxlan
 * socket - reusing an existing socket with a matching family, port and
 * flags when one is alive, otherwise deferring socket creation to a
 * work item that runs outside of RTNL.
 */
static int vxlan_init(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs;
	bool ipv6 = vxlan->flags & VXLAN_F_IPV6;

	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
			     vxlan->dst_port, vxlan->flags);
	/* atomic_add_unless() fails when refcnt is 0, i.e. the socket is
	 * being torn down and must not be reused.
	 */
	if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
		/* If we have a socket with same port already, reuse it */
		vxlan_vs_add_dev(vs, vxlan);
	} else {
		/* otherwise make new socket outside of RTNL */
		dev_hold(dev);
		queue_work(vxlan_wq, &vxlan->sock_work);
	}
	spin_unlock(&vn->sock_lock);

	return 0;
}
2210
2211 static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
2212 {
2213         struct vxlan_fdb *f;
2214
2215         spin_lock_bh(&vxlan->hash_lock);
2216         f = __vxlan_find_mac(vxlan, all_zeros_mac);
2217         if (f)
2218                 vxlan_fdb_destroy(vxlan, f);
2219         spin_unlock_bh(&vxlan->hash_lock);
2220 }
2221
2222 static void vxlan_uninit(struct net_device *dev)
2223 {
2224         struct vxlan_dev *vxlan = netdev_priv(dev);
2225         struct vxlan_sock *vs = vxlan->vn_sock;
2226
2227         vxlan_fdb_delete_default(vxlan);
2228
2229         if (vs)
2230                 vxlan_sock_release(vs);
2231         free_percpu(dev->tstats);
2232 }
2233
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs = vxlan->vn_sock;

	/* socket hasn't been created */
	if (!vs)
		return -ENOTCONN;

	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
		/* Hold socket and device refs for the deferred igmp_join
		 * work; presumably released by the work item - confirm.
		 */
		vxlan_sock_hold(vs);
		dev_hold(dev);
		queue_work(vxlan_wq, &vxlan->igmp_join);
	}

	/* age_interval == 0 disables fdb ageing entirely */
	if (vxlan->age_interval)
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

	return 0;
}
2255
/* Purge the forwarding table: destroy every learned entry except the
 * all-zeros default, which has its own teardown path.
 */
static void vxlan_flush(struct vxlan_dev *vxlan)
{
	unsigned int h;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
				vxlan_fdb_destroy(vxlan, f);
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);
}
2274
/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = vxlan->vn_sock;

	/* Leave the multicast group, but only when no other vxlan device
	 * in this namespace still uses it.  Refs are held for the
	 * deferred igmp_leave work; presumably released there - confirm.
	 */
	if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
	    !vxlan_group_used(vn, vxlan)) {
		vxlan_sock_hold(vs);
		dev_hold(dev);
		queue_work(vxlan_wq, &vxlan->igmp_leave);
	}

	del_timer_sync(&vxlan->age_timer);

	vxlan_flush(vxlan);

	return 0;
}
2295
/* ndo_set_rx_mode stub; nothing needs to be done. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
2300
2301 static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
2302 {
2303         struct vxlan_dev *vxlan = netdev_priv(dev);
2304         struct vxlan_rdst *dst = &vxlan->default_dst;
2305         struct net_device *lowerdev;
2306         int max_mtu;
2307
2308         lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex);
2309         if (lowerdev == NULL)
2310                 return eth_change_mtu(dev, new_mtu);
2311
2312         if (dst->remote_ip.sa.sa_family == AF_INET6)
2313                 max_mtu = lowerdev->mtu - VXLAN6_HEADROOM;
2314         else
2315                 max_mtu = lowerdev->mtu - VXLAN_HEADROOM;
2316
2317         if (new_mtu < 68 || new_mtu > max_mtu)
2318                 return -EINVAL;
2319
2320         dev->mtu = new_mtu;
2321         return 0;
2322 }
2323
/* Network device operations for vxlan interfaces */
static const struct net_device_ops vxlan_netdev_ops = {
	.ndo_init               = vxlan_init,
	.ndo_uninit             = vxlan_uninit,
	.ndo_open               = vxlan_open,
	.ndo_stop               = vxlan_stop,
	.ndo_start_xmit         = vxlan_xmit,
	.ndo_get_stats64        = ip_tunnel_get_stats64,
	.ndo_set_rx_mode        = vxlan_set_multicast_list,
	.ndo_change_mtu         = vxlan_change_mtu,
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_set_mac_address    = eth_mac_addr,
	.ndo_fdb_add            = vxlan_fdb_add,
	.ndo_fdb_del            = vxlan_fdb_delete,
	.ndo_fdb_dump           = vxlan_fdb_dump,
};
2339
/* Info for udev, that this is a virtual tunnel endpoint
 * (attached via SET_NETDEV_DEVTYPE in vxlan_setup).
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};
2344
/* Calls the ndo_add_vxlan_port of the caller in order to
 * supply the listening VXLAN udp ports. Callers are expected
 * to implement the ndo_add_vxlan_port.
 *
 * @dev: device whose ndo_add_vxlan_port is invoked once per open
 *       vxlan socket in @dev's network namespace.
 */
void vxlan_get_rx_port(struct net_device *dev)
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	sa_family_t sa_family;
	__be16 port;
	unsigned int i;

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			port = inet_sk(vs->sock->sk)->inet_sport;
			sa_family = vs->sock->sk->sk_family;
			dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
							    port);
		}
	}
	spin_unlock(&vn->sock_lock);
}
EXPORT_SYMBOL_GPL(vxlan_get_rx_port);
2370
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;

	eth_hw_addr_random(dev);
	ether_setup(dev);
	/* Reserve headroom for the full encapsulation overhead */
	if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6)
		dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM;
	else
		dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM;

	dev->netdev_ops = &vxlan_netdev_ops;
	dev->destructor = free_netdev;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->tx_queue_len = 0;
	dev->features   |= NETIF_F_LLTX;
	dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features   |= NETIF_F_RXCSUM;
	dev->features   |= NETIF_F_GSO_SOFTWARE;

	dev->vlan_features = dev->features;
	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
	netif_keep_dst(dev);
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	INIT_LIST_HEAD(&vxlan->next);
	spin_lock_init(&vxlan->hash_lock);
	INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join);
	INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave);
	INIT_WORK(&vxlan->sock_work, vxlan_sock_work);

	/* Deferrable timer: fdb ageing does not need precise expiry */
	init_timer_deferrable(&vxlan->age_timer);
	vxlan->age_timer.function = vxlan_cleanup;
	vxlan->age_timer.data = (unsigned long) vxlan;

	vxlan->dst_port = htons(vxlan_port);

	vxlan->dev = dev;

	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
2419
/* Netlink attribute policy for IFLA_VXLAN_* configuration attributes */
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]         = { .type = NLA_U32 },
	[IFLA_VXLAN_GROUP]      = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_VXLAN_GROUP6]     = { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_LINK]       = { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]      = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VXLAN_LOCAL6]     = { .len = sizeof(struct in6_addr) },
	[IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
	[IFLA_VXLAN_LEARNING]   = { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]     = { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]      = { .type = NLA_U32 },
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
	[IFLA_VXLAN_PROXY]      = { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]     = { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]     = { .type = NLA_U8 },
	[IFLA_VXLAN_PORT]       = { .type = NLA_U16 },
	[IFLA_VXLAN_UDP_CSUM]   = { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]  = { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]  = { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 },
	[IFLA_VXLAN_GBP]        = { .type = NLA_FLAG, },
};
2445
2446 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
2447 {
2448         if (tb[IFLA_ADDRESS]) {
2449                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
2450                         pr_debug("invalid link address (not ethernet)\n");
2451                         return -EINVAL;
2452                 }
2453
2454                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
2455                         pr_debug("invalid all zero ethernet address\n");
2456                         return -EADDRNOTAVAIL;
2457                 }
2458         }
2459
2460         if (!data)
2461                 return -EINVAL;
2462
2463         if (data[IFLA_VXLAN_ID]) {
2464                 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
2465                 if (id >= VXLAN_VID_MASK)
2466                         return -ERANGE;
2467         }
2468
2469         if (data[IFLA_VXLAN_PORT_RANGE]) {
2470                 const struct ifla_vxlan_port_range *p
2471                         = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
2472
2473                 if (ntohs(p->high) < ntohs(p->low)) {
2474                         pr_debug("port range %u .. %u not valid\n",
2475                                  ntohs(p->low), ntohs(p->high));
2476                         return -EINVAL;
2477                 }
2478         }
2479
2480         return 0;
2481 }
2482
2483 static void vxlan_get_drvinfo(struct net_device *netdev,
2484                               struct ethtool_drvinfo *drvinfo)
2485 {
2486         strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
2487         strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
2488 }
2489
/* ethtool hooks: driver info and link state only */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo    = vxlan_get_drvinfo,
	.get_link       = ethtool_op_get_link,
};
2494
2495 static void vxlan_del_work(struct work_struct *work)
2496 {
2497         struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
2498         udp_tunnel_sock_release(vs->sock);
2499         kfree_rcu(vs, rcu);
2500 }
2501
2502 static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
2503                                         __be16 port, u32 flags)
2504 {
2505         struct socket *sock;
2506         struct udp_port_cfg udp_conf;
2507         int err;
2508
2509         memset(&udp_conf, 0, sizeof(udp_conf));
2510
2511         if (ipv6) {
2512                 udp_conf.family = AF_INET6;
2513                 udp_conf.use_udp6_tx_checksums =
2514                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
2515                 udp_conf.use_udp6_rx_checksums =
2516                     !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
2517         } else {
2518                 udp_conf.family = AF_INET;
2519                 udp_conf.local_ip.s_addr = INADDR_ANY;
2520                 udp_conf.use_udp_checksums =
2521                     !!(flags & VXLAN_F_UDP_CSUM);
2522         }
2523
2524         udp_conf.local_udp_port = port;
2525
2526         /* Open UDP socket */
2527         err = udp_sock_create(net, &udp_conf, &sock);
2528         if (err < 0)
2529                 return ERR_PTR(err);
2530
2531         return sock;
2532 }
2533
/* Create new listen socket if needed.
 *
 * Allocates a vxlan_sock, opens the underlying UDP socket, registers
 * GRO offload callbacks, publishes the socket in the per-namespace
 * port hash and marks the socket as an encapsulation socket.
 * Returns ERR_PTR() on failure.
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
					      vxlan_rcv_t *rcv, void *data,
					      u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
	bool ipv6 = !!(flags & VXLAN_F_IPV6);
	struct udp_tunnel_sock_cfg tunnel_cfg;

	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	sock = vxlan_create_sock(net, ipv6, port, flags);
	if (IS_ERR(sock)) {
		kfree(vs);
		return ERR_CAST(sock);
	}

	vs->sock = sock;
	atomic_set(&vs->refcnt, 1);
	vs->rcv = rcv;
	vs->data = data;
	vs->flags = flags;

	/* Initialize the vxlan udp offloads structure */
	vs->udp_offloads.port = port;
	vs->udp_offloads.callbacks.gro_receive  = vxlan_gro_receive;
	vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;

	/* Publish in the per-namespace port hash and notify
	 * offload-capable drivers of the new listening port.
	 */
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
	vxlan_notify_add_rx_port(vs);
	spin_unlock(&vn->sock_lock);

	/* Mark socket as an encapsulation socket. */
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	return vs;
}
2587
/* Obtain a vxlan_sock for @port: first try to create a fresh one; if
 * creation fails (typically because the port is already bound) and
 * sharing is allowed, look up the existing socket and take a reference
 * on it instead.  Returns the socket or an ERR_PTR (-EBUSY when the
 * existing socket cannot be shared, -EINVAL when none is found).
 */
struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
				  vxlan_rcv_t *rcv, void *data,
				  bool no_share, u32 flags)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	bool ipv6 = flags & VXLAN_F_IPV6;

	vs = vxlan_socket_create(net, port, rcv, data, flags);
	if (!IS_ERR(vs))
		return vs;

	if (no_share)	/* Return error if sharing is not allowed. */
		return vs;

	spin_lock(&vn->sock_lock);
	vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags);
	/* Refuse to share when the receive callback differs, or when the
	 * socket is already being torn down (refcnt has reached zero and
	 * atomic_add_unless() therefore fails to bump it).
	 */
	if (vs && ((vs->rcv != rcv) ||
		   !atomic_add_unless(&vs->refcnt, 1, 0)))
			vs = ERR_PTR(-EBUSY);
	spin_unlock(&vn->sock_lock);

	if (!vs)
		vs = ERR_PTR(-EINVAL);

	return vs;
}
EXPORT_SYMBOL_GPL(vxlan_sock_add);
2616
/* Scheduled at device creation to bind to a socket.  Runs from the
 * vxlan workqueue; errors from vxlan_sock_add() are not propagated,
 * the device simply stays unbound.
 */
static void vxlan_sock_work(struct work_struct *work)
{
	struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work);
	struct net *net = vxlan->net;
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	__be16 port = vxlan->dst_port;
	struct vxlan_sock *nvs;

	nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags);
	spin_lock(&vn->sock_lock);
	if (!IS_ERR(nvs))
		vxlan_vs_add_dev(nvs, vxlan);
	spin_unlock(&vn->sock_lock);

	/* Drop the device reference taken when this work was queued. */
	dev_put(vxlan->dev);
}
2634
/* rtnl_link_ops->newlink: configure a new vxlan device from netlink
 * attributes (already validated by vxlan_validate) and register it.
 * Returns 0 on success or a negative errno.
 */
static int vxlan_newlink(struct net *net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	__u32 vni;
	int err;
	bool use_ipv6 = false;

	/* The VNI is the only mandatory attribute. */
	if (!data[IFLA_VXLAN_ID])
		return -EINVAL;

	vxlan->net = dev_net(dev);

	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
	dst->remote_vni = vni;

	/* Unless IPv6 is explicitly requested, assume IPv4 */
	dst->remote_ip.sa.sa_family = AF_INET;
	if (data[IFLA_VXLAN_GROUP]) {
		dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
			   sizeof(struct in6_addr));
		dst->remote_ip.sa.sa_family = AF_INET6;
		use_ipv6 = true;
	}

	if (data[IFLA_VXLAN_LOCAL]) {
		vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
		vxlan->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		/* TODO: respect scope id */
		nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
			   sizeof(struct in6_addr));
		vxlan->saddr.sa.sa_family = AF_INET6;
		use_ipv6 = true;
	}

	/* Note: the assignment inside the condition is intentional; a
	 * LINK attribute of 0 is treated the same as no LINK attribute.
	 */
	if (data[IFLA_VXLAN_LINK] &&
	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
		struct net_device *lowerdev
			 = __dev_get_by_index(net, dst->remote_ifindex);

		if (!lowerdev) {
			pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
			return -ENODEV;
		}

#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
			if (idev && idev->cnf.disable_ipv6) {
				pr_info("IPv6 is disabled via sysctl\n");
				return -EPERM;
			}
			vxlan->flags |= VXLAN_F_IPV6;
		}
#endif

		/* Derive a default MTU from the lower device unless the
		 * user supplied one explicitly via IFLA_MTU.
		 */
		if (!tb[IFLA_MTU])
			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);

		dev->needed_headroom = lowerdev->hard_header_len +
				       (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
	} else if (use_ipv6)
		vxlan->flags |= VXLAN_F_IPV6;

	if (data[IFLA_VXLAN_TOS])
		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	/* Learning defaults to ON when the attribute is absent. */
	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
		vxlan->flags |= VXLAN_F_LEARN;

	if (data[IFLA_VXLAN_AGEING])
		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
	else
		vxlan->age_interval = FDB_AGE_DEFAULT;

	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
		vxlan->flags |= VXLAN_F_PROXY;

	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
		vxlan->flags |= VXLAN_F_RSC;

	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
		vxlan->flags |= VXLAN_F_L2MISS;

	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
		vxlan->flags |= VXLAN_F_L3MISS;

	if (data[IFLA_VXLAN_LIMIT])
		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		vxlan->port_min = ntohs(p->low);
		vxlan->port_max = ntohs(p->high);
	}

	if (data[IFLA_VXLAN_PORT])
		vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);

	if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
		vxlan->flags |= VXLAN_F_UDP_CSUM;

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] &&
	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]))
		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX;

	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] &&
	    nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]))
		vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX;

	if (data[IFLA_VXLAN_REMCSUM_TX] &&
	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX]))
		vxlan->flags |= VXLAN_F_REMCSUM_TX;

	if (data[IFLA_VXLAN_REMCSUM_RX] &&
	    nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX]))
		vxlan->flags |= VXLAN_F_REMCSUM_RX;

	if (data[IFLA_VXLAN_GBP])
		vxlan->flags |= VXLAN_F_GBP;

	/* The (VNI, family, port, flags) tuple must be unique. */
	if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET,
			   vxlan->dst_port, vxlan->flags)) {
		pr_info("duplicate VNI %u\n", vni);
		return -EEXIST;
	}

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
				       &vxlan->default_dst.remote_ip,
				       NUD_REACHABLE|NUD_PERMANENT,
				       NLM_F_EXCL|NLM_F_CREATE,
				       vxlan->dst_port,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_ifindex,
				       NTF_SELF);
		if (err)
			return err;
	}

	err = register_netdevice(dev);
	if (err) {
		/* Undo the default-destination fdb entry created above. */
		vxlan_fdb_delete_default(vxlan);
		return err;
	}

	list_add(&vxlan->next, &vn->vxlan_list);

	return 0;
}
2803
2804 static void vxlan_dellink(struct net_device *dev, struct list_head *head)
2805 {
2806         struct vxlan_dev *vxlan = netdev_priv(dev);
2807         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2808
2809         spin_lock(&vn->sock_lock);
2810         if (!hlist_unhashed(&vxlan->hlist))
2811                 hlist_del_rcu(&vxlan->hlist);
2812         spin_unlock(&vn->sock_lock);
2813
2814         list_del(&vxlan->next);
2815         unregister_netdevice_queue(dev, head);
2816 }
2817
2818 static size_t vxlan_get_size(const struct net_device *dev)
2819 {
2820
2821         return nla_total_size(sizeof(__u32)) +  /* IFLA_VXLAN_ID */
2822                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
2823                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
2824                 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
2825                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TTL */
2826                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_TOS */
2827                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_LEARNING */
2828                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_PROXY */
2829                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_RSC */
2830                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L2MISS */
2831                 nla_total_size(sizeof(__u8)) +  /* IFLA_VXLAN_L3MISS */
2832                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
2833                 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
2834                 nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
2835                 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
2836                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
2837                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
2838                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
2839                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
2840                 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
2841                 0;
2842 }
2843
/* rtnl_link_ops->fill_info: dump the device configuration as netlink
 * attributes into @skb.  Returns 0 on success or -EMSGSIZE when @skb
 * runs out of room (caller re-sizes per vxlan_get_size()).
 */
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct vxlan_rdst *dst = &vxlan->default_dst;
	struct ifla_vxlan_port_range ports = {
		.low =  htons(vxlan->port_min),
		.high = htons(vxlan->port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
		goto nla_put_failure;

	/* Emit GROUP or GROUP6 depending on the remote address family;
	 * omit the attribute entirely for an unset remote.
	 */
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
			if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
					 dst->remote_ip.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
				    &dst->remote_ip.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
		goto nla_put_failure;

	/* Same family-dependent handling for the local address. */
	if (!vxlan_addr_any(&vxlan->saddr)) {
		if (vxlan->saddr.sa.sa_family == AF_INET) {
			if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
					 vxlan->saddr.sin.sin_addr.s_addr))
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
				    &vxlan->saddr.sin6.sin6_addr))
				goto nla_put_failure;
#endif
		}
	}

	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
			!!(vxlan->flags & VXLAN_F_LEARN)) ||
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
			!!(vxlan->flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
			!!(vxlan->flags & VXLAN_F_L2MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
			!!(vxlan->flags & VXLAN_F_L3MISS)) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
			!!(vxlan->flags & VXLAN_F_UDP_CSUM)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
			!!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
			!!(vxlan->flags & VXLAN_F_REMCSUM_TX)) ||
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
			!!(vxlan->flags & VXLAN_F_REMCSUM_RX)))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	/* GBP is a flag attribute: present only when enabled. */
	if (vxlan->flags & VXLAN_F_GBP &&
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
2925
2926 static struct net *vxlan_get_link_net(const struct net_device *dev)
2927 {
2928         struct vxlan_dev *vxlan = netdev_priv(dev);
2929
2930         return vxlan->net;
2931 }
2932
/* rtnetlink glue: "ip link add ... type vxlan" dispatches through
 * these callbacks.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,
	.validate	= vxlan_validate,
	.newlink	= vxlan_newlink,
	.dellink	= vxlan_dellink,
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};
2946
2947 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
2948                                              struct net_device *dev)
2949 {
2950         struct vxlan_dev *vxlan, *next;
2951         LIST_HEAD(list_kill);
2952
2953         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
2954                 struct vxlan_rdst *dst = &vxlan->default_dst;
2955
2956                 /* In case we created vxlan device with carrier
2957                  * and we loose the carrier due to module unload
2958                  * we also need to remove vxlan device. In other
2959                  * cases, it's not necessary and remote_ifindex
2960                  * is 0 here, so no matches.
2961                  */
2962                 if (dst->remote_ifindex == dev->ifindex)
2963                         vxlan_dellink(vxlan->dev, &list_kill);
2964         }
2965
2966         unregister_netdevice_many(&list_kill);
2967 }
2968
2969 static int vxlan_lowerdev_event(struct notifier_block *unused,
2970                                 unsigned long event, void *ptr)
2971 {
2972         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2973         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
2974
2975         if (event == NETDEV_UNREGISTER)
2976                 vxlan_handle_lowerdev_unregister(vn, dev);
2977
2978         return NOTIFY_DONE;
2979 }
2980
/* Registered on module init; see vxlan_lowerdev_event(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_lowerdev_event,
};
2984
2985 static __net_init int vxlan_init_net(struct net *net)
2986 {
2987         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
2988         unsigned int h;
2989
2990         INIT_LIST_HEAD(&vn->vxlan_list);
2991         spin_lock_init(&vn->sock_lock);
2992
2993         for (h = 0; h < PORT_HASH_SIZE; ++h)
2994                 INIT_HLIST_HEAD(&vn->sock_list[h]);
2995
2996         return 0;
2997 }
2998
2999 static void __net_exit vxlan_exit_net(struct net *net)
3000 {
3001         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
3002         struct vxlan_dev *vxlan, *next;
3003         struct net_device *dev, *aux;
3004         LIST_HEAD(list);
3005
3006         rtnl_lock();
3007         for_each_netdev_safe(net, dev, aux)
3008                 if (dev->rtnl_link_ops == &vxlan_link_ops)
3009                         unregister_netdevice_queue(dev, &list);
3010
3011         list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
3012                 /* If vxlan->dev is in the same netns, it has already been added
3013                  * to the list by the previous loop.
3014                  */
3015                 if (!net_eq(dev_net(vxlan->dev), net))
3016                         unregister_netdevice_queue(dev, &list);
3017         }
3018
3019         unregister_netdevice_many(&list);
3020         rtnl_unlock();
3021 }
3022
/* Per-network-namespace state: one vxlan_net allocated per netns,
 * located via vxlan_net_id.
 */
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
	.exit = vxlan_exit_net,
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};
3029
3030 static int __init vxlan_init_module(void)
3031 {
3032         int rc;
3033
3034         vxlan_wq = alloc_workqueue("vxlan", 0, 0);
3035         if (!vxlan_wq)
3036                 return -ENOMEM;
3037
3038         get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
3039
3040         rc = register_pernet_subsys(&vxlan_net_ops);
3041         if (rc)
3042                 goto out1;
3043
3044         rc = register_netdevice_notifier(&vxlan_notifier_block);
3045         if (rc)
3046                 goto out2;
3047
3048         rc = rtnl_link_register(&vxlan_link_ops);
3049         if (rc)
3050                 goto out3;
3051
3052         return 0;
3053 out3:
3054         unregister_netdevice_notifier(&vxlan_notifier_block);
3055 out2:
3056         unregister_pernet_subsys(&vxlan_net_ops);
3057 out1:
3058         destroy_workqueue(vxlan_wq);
3059         return rc;
3060 }
3061 late_initcall(vxlan_init_module);
3062
/* Module exit: tear down in reverse of registration where it matters —
 * unregister the link ops and notifier before destroying the workqueue,
 * and remove the pernet subsystem last.
 */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_netdevice_notifier(&vxlan_notifier_block);
	destroy_workqueue(vxlan_wq);
	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);
3072
/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");