/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <linux/bpf.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
#include <linux/sctp.h>
#include <linux/crash_dump.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

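/*
 * Usage sketch (illustrative only, not part of the original file): a
 * protocol module typically embeds a packet_type in static storage and
 * registers it once at init time.  "my_rcv" and "my_pt" below are
 * hypothetical names; multiple handlers may coexist for the same type.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		(a real handler would parse it first)
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_pt);		(from module_init)
 *	dev_remove_pack(&my_pt);	(from module_exit)
 */
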
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

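/*
 * For reference (a sketch matching the parsing above): the option string
 * is "netdev=irq,io,mem_start,mem_end,name", so a command line such as
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * stores irq 9 and I/O base 0x300 for "eth0" in dev_boot_setup, to be
 * picked up later by netdev_boot_setup_check() during device probing.
 */
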
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	a user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

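/*
 * Usage sketch (illustrative, not part of the original file): because
 * dev_get_by_name() takes a reference, every successful lookup must be
 * paired with dev_put() once the caller is done with the device.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_info("%s: mtu %u\n", dev->name, dev->mtu);
 *		dev_put(dev);
 *	}
 */
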
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

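/*
 * Examples (illustrative, not part of the original file): "eth0" and
 * "wlan-5" pass the checks above; "", ".", "..", "a/b", "x:y", names
 * containing whitespace, and names of IFNAMSIZ characters or more are
 * all rejected.
 */
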
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

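/*
 * Usage sketch (illustrative, not part of the original file): a driver
 * that does not care about the unit number can let the core pick one
 * from a format string before registering the device; on success
 * dev->name holds the expanded name, e.g. "dummy0".
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		return err;
 */
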
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return 0;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

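/*
 * Usage sketch (illustrative, not part of the original file): dev_open()
 * must run under the RTNL semaphore, as __dev_open() asserts, so an
 * in-kernel caller outside the rtnetlink paths brackets it explicitly.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */
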
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);

/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

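/*
 * Usage sketch (illustrative, not part of the original file): a
 * hypothetical subsystem watching for interfaces coming up.  Thanks to
 * the replay described above, already-registered and already-up devices
 * are reported to the new notifier as well.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 */
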
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
static struct static_key egress_needed __read_mostly;

void net_inc_egress_queue(void)
{
	static_key_slow_inc(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

void net_dec_egress_queue(void)
{
	static_key_slow_dec(&egress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}

bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int ret = ____dev_forward_skb(dev, skb);

	if (likely(!ret)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return ret;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

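/*
 * Usage sketch (illustrative, not part of the original file): a virtual
 * device pair can hand a transmitted skb straight to its peer's receive
 * path from ndo_start_xmit(), veth-style; "my_get_peer" is hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		dev_forward_skb(peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */
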
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so disable priority mappings.
 * It is expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}

static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
#endif
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}
	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
#endif /* CONFIG_XPS */

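/*
 * Usage sketch (illustrative, not part of the original file): a driver
 * could steer transmit queue "qid" to a single CPU by passing a one-CPU
 * mask; the same mapping is otherwise reachable from user space via
 * /sys/class/net/<dev>/queues/tx-<n>/xps_cpus.
 *
 *	err = netif_set_xps_queue(dev, cpumask_of(cpu), qid);
 */
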
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		if (txq < dev->real_num_tx_queues) {
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	}

	dev->real_num_tx_queues = txq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

#ifdef CONFIG_SYSFS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return is_kdump_kernel() ?
		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

static void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 *	netif_wake_subqueue - allow sending packets on subqueue
 *	@dev: network device
 *	@queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

2350 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2352 if (in_irq() || irqs_disabled())
2353 __dev_kfree_skb_irq(skb, reason);
2357 EXPORT_SYMBOL(__dev_kfree_skb_any);
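/*
 * Editor's sketch: freeing completed TX skbs from a context that may be
 * hardirq. dev_consume_skb_any() and dev_kfree_skb_any() are thin
 * wrappers around __dev_kfree_skb_any() with the matching reason, which
 * keeps drop tracing meaningful.
 */
static void example_free_tx_skb(struct sk_buff *skb, bool transmitted)
{
	if (transmitted)
		dev_consume_skb_any(skb);	/* successful TX, not a drop */
	else
		dev_kfree_skb_any(skb);		/* shows up as a drop */
}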
2361 * netif_device_detach - mark device as removed
2362 * @dev: network device
2364 * Mark device as removed from system and therefore no longer available.
2366 void netif_device_detach(struct net_device *dev)
2368 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2369 netif_running(dev)) {
2370 netif_tx_stop_all_queues(dev);
2373 EXPORT_SYMBOL(netif_device_detach);
2376 * netif_device_attach - mark device as attached
2377 * @dev: network device
2379 * Mark the device as attached to the system and restart it if needed.
2381 void netif_device_attach(struct net_device *dev)
2383 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2384 netif_running(dev)) {
2385 netif_tx_wake_all_queues(dev);
2386 __netdev_watchdog_up(dev);
2389 EXPORT_SYMBOL(netif_device_attach);
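/*
 * Editor's sketch: the usual suspend/resume pairing for
 * netif_device_detach()/netif_device_attach() in a hypothetical PCI
 * network driver.
 */
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	netif_device_detach(dev);
	/* ... quiesce the hardware and save its state ... */
	return 0;
}

static int example_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);

	/* ... reprogram the hardware ... */
	netif_device_attach(dev);
	return 0;
}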
2392 * Returns a Tx hash based on the given packet descriptor and the number
2393 * of Tx queues to be used as a distribution range.
2395 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2396 unsigned int num_tx_queues)
2400 u16 qcount = num_tx_queues;
2402 if (skb_rx_queue_recorded(skb)) {
2403 hash = skb_get_rx_queue(skb);
2404 while (unlikely(hash >= num_tx_queues))
2405 hash -= num_tx_queues;
2410 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2411 qoffset = dev->tc_to_txq[tc].offset;
2412 qcount = dev->tc_to_txq[tc].count;
2415 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2417 EXPORT_SYMBOL(__skb_tx_hash);
2419 static void skb_warn_bad_offload(const struct sk_buff *skb)
2421 static const netdev_features_t null_features;
2422 struct net_device *dev = skb->dev;
2423 const char *name = "";
2425 if (!net_ratelimit())
2429 if (dev->dev.parent)
2430 name = dev_driver_string(dev->dev.parent);
2432 name = netdev_name(dev);
2434 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2435 "gso_type=%d ip_summed=%d\n",
2436 name, dev ? &dev->features : &null_features,
2437 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2438 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2439 skb_shinfo(skb)->gso_type, skb->ip_summed);
2443 * Invalidate hardware checksum when packet is to be mangled, and
2444 * complete checksum manually on outgoing path.
2446 int skb_checksum_help(struct sk_buff *skb)
2449 int ret = 0, offset;
2451 if (skb->ip_summed == CHECKSUM_COMPLETE)
2452 goto out_set_summed;
2454 if (unlikely(skb_shinfo(skb)->gso_size)) {
2455 skb_warn_bad_offload(skb);
2459 /* Before computing a checksum, we should make sure no frag could
2460 * be modified by an external entity: the checksum could be wrong.
2462 if (skb_has_shared_frag(skb)) {
2463 ret = __skb_linearize(skb);
2468 offset = skb_checksum_start_offset(skb);
2469 BUG_ON(offset >= skb_headlen(skb));
2470 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2472 offset += skb->csum_offset;
2473 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2475 if (skb_cloned(skb) &&
2476 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2477 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2482 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2484 skb->ip_summed = CHECKSUM_NONE;
2488 EXPORT_SYMBOL(skb_checksum_help);
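/*
 * Editor's sketch: the standard software fallback in a driver xmit path
 * when this particular packet cannot be checksummed by the hardware.
 */
static int example_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum &&
	    skb_checksum_help(skb))
		return -EIO;	/* caller should drop the skb */

	return 0;
}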
2490 /* skb_csum_offload_check - Driver helper function to determine if a device
2491 * with limited checksum offload capabilities is able to offload the checksum
2492 * for a given packet.
2495 * skb - sk_buff for the packet in question
2496 * spec - contains the description of what the device can offload
2497 * csum_encapped - returns true if the checksum being offloaded is
2498 * encapsulated. That is, it is the checksum for the transport header
2499 * in the inner headers.
2500 * checksum_help - when set indicates that helper function should
2501 * call skb_checksum_help if offload checks fail
2504 * true: Packet has passed the checksum checks and should be offloadable to
2505 * the device (a driver may still need to check for additional
2506 * restrictions of its device)
2507 * false: Checksum is not offloadable. If checksum_help was set then
2508 * skb_checksum_help was called to resolve checksum for non-GSO
2509 * packets and when IP protocol is not SCTP
2511 bool __skb_csum_offload_chk(struct sk_buff *skb,
2512 const struct skb_csum_offl_spec *spec,
2513 bool *csum_encapped,
2517 struct ipv6hdr *ipv6;
2522 if (skb->protocol == htons(ETH_P_8021Q) ||
2523 skb->protocol == htons(ETH_P_8021AD)) {
2524 if (!spec->vlan_okay)
2528 /* We check whether the checksum refers to a transport layer checksum in
2529 * the outermost header or an encapsulated transport layer checksum that
2530 * corresponds to the inner headers of the skb. If the checksum is for
2531 * something else in the packet we need help.
2533 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2534 /* Non-encapsulated checksum */
2535 protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2536 nhdr = skb_network_header(skb);
2537 *csum_encapped = false;
2538 if (spec->no_not_encapped)
2540 } else if (skb->encapsulation && spec->encap_okay &&
2541 skb_checksum_start_offset(skb) ==
2542 skb_inner_transport_offset(skb)) {
2543 /* Encapsulated checksum */
2544 *csum_encapped = true;
2545 switch (skb->inner_protocol_type) {
2546 case ENCAP_TYPE_ETHER:
2547 protocol = eproto_to_ipproto(skb->inner_protocol);
2549 case ENCAP_TYPE_IPPROTO:
2550 protocol = skb->inner_protocol;
2553 nhdr = skb_inner_network_header(skb);
2560 if (!spec->ipv4_okay)
2563 ip_proto = iph->protocol;
2564 if (iph->ihl != 5 && !spec->ip_options_okay)
2568 if (!spec->ipv6_okay)
2570 if (spec->no_encapped_ipv6 && *csum_encapped)
2573 nhdr += sizeof(*ipv6);
2574 ip_proto = ipv6->nexthdr;
2583 if (!spec->tcp_okay ||
2584 skb->csum_offset != offsetof(struct tcphdr, check))
2588 if (!spec->udp_okay ||
2589 skb->csum_offset != offsetof(struct udphdr, check))
2593 if (!spec->sctp_okay ||
2594 skb->csum_offset != offsetof(struct sctphdr, checksum))
2598 case NEXTHDR_ROUTING:
2599 case NEXTHDR_DEST: {
2602 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2605 ip_proto = opthdr[0];
2606 nhdr += (opthdr[1] + 1) << 3;
2608 goto ip_proto_again;
2614 /* Passed the tests for offloading checksum */
2618 if (csum_help && !skb_shinfo(skb)->gso_size)
2619 skb_checksum_help(skb);
2623 EXPORT_SYMBOL(__skb_csum_offload_chk);
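/*
 * Editor's sketch: a driver with limited checksum offload describes its
 * abilities in a skb_csum_offl_spec and lets the helper either approve
 * the offload or (csum_help=true) resolve it in software. The spec
 * values below are illustrative, not from any real device.
 */
static bool example_can_offload_csum(struct sk_buff *skb)
{
	static const struct skb_csum_offl_spec spec = {
		.ipv4_okay = 1,
		.ipv6_okay = 1,
		.tcp_okay = 1,
		.udp_okay = 1,
	};
	bool csum_encapped;

	return __skb_csum_offload_chk(skb, &spec, &csum_encapped, true);
}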
2625 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2627 __be16 type = skb->protocol;
2629 /* Tunnel gso handlers can set protocol to ethernet. */
2630 if (type == htons(ETH_P_TEB)) {
2633 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2636 eth = (struct ethhdr *)skb_mac_header(skb);
2637 type = eth->h_proto;
2640 return __vlan_get_protocol(skb, type, depth);
2644 * skb_mac_gso_segment - mac layer segmentation handler.
2645 * @skb: buffer to segment
2646 * @features: features for the output path (see dev->features)
2648 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2649 netdev_features_t features)
2651 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2652 struct packet_offload *ptype;
2653 int vlan_depth = skb->mac_len;
2654 __be16 type = skb_network_protocol(skb, &vlan_depth);
2656 if (unlikely(!type))
2657 return ERR_PTR(-EINVAL);
2659 __skb_pull(skb, vlan_depth);
2662 list_for_each_entry_rcu(ptype, &offload_base, list) {
2663 if (ptype->type == type && ptype->callbacks.gso_segment) {
2664 segs = ptype->callbacks.gso_segment(skb, features);
2670 __skb_push(skb, skb->data - skb_mac_header(skb));
2674 EXPORT_SYMBOL(skb_mac_gso_segment);
2677 /* openvswitch calls this on rx path, so we need a different check.
2679 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2682 return skb->ip_summed != CHECKSUM_PARTIAL;
2684 return skb->ip_summed == CHECKSUM_NONE;
2688 * __skb_gso_segment - Perform segmentation on skb.
2689 * @skb: buffer to segment
2690 * @features: features for the output path (see dev->features)
2691 * @tx_path: whether it is called in TX path
2693 * This function segments the given skb and returns a list of segments.
2695 * It may return NULL if the skb requires no segmentation. This is
2696 * only possible when GSO is used for verifying header integrity.
2698 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2700 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2701 netdev_features_t features, bool tx_path)
2703 if (unlikely(skb_needs_check(skb, tx_path))) {
2706 skb_warn_bad_offload(skb);
2708 err = skb_cow_head(skb, 0);
2710 return ERR_PTR(err);
2713 /* Only report GSO partial support if it will enable us to
2714 * support segmentation on this frame without needing additional
2715 * resources.
2717 if (features & NETIF_F_GSO_PARTIAL) {
2718 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2719 struct net_device *dev = skb->dev;
2721 partial_features |= dev->features & dev->gso_partial_features;
2722 if (!skb_gso_ok(skb, features | partial_features))
2723 features &= ~NETIF_F_GSO_PARTIAL;
2726 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2727 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2729 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2730 SKB_GSO_CB(skb)->encap_level = 0;
2732 skb_reset_mac_header(skb);
2733 skb_reset_mac_len(skb);
2735 return skb_mac_gso_segment(skb, features);
2737 EXPORT_SYMBOL(__skb_gso_segment);
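/*
 * Editor's sketch: a software-segmentation fallback. The segments come
 * back chained through skb->next; unlink each one before handing it to
 * the (hypothetical) single-frame transmit helper.
 */
static int example_xmit_one_frame(struct sk_buff *skb);	/* hypothetical */

static int example_sw_gso(struct sk_buff *skb, netdev_features_t features)
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)	/* header check only; nothing was split */
		return example_xmit_one_frame(skb);

	consume_skb(skb);	/* original skb is no longer needed */
	for (; segs; segs = next) {
		next = segs->next;
		segs->next = NULL;
		example_xmit_one_frame(segs);
	}
	return 0;
}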
2739 /* Take action when hardware reception checksum errors are detected. */
2741 void netdev_rx_csum_fault(struct net_device *dev)
2743 if (net_ratelimit()) {
2744 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2748 EXPORT_SYMBOL(netdev_rx_csum_fault);
2751 /* Actually, we should eliminate this check as soon as we know that:
2752 * 1. An IOMMU is present and allows mapping all the memory.
2753 * 2. No high memory really exists on this machine.
2756 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2758 #ifdef CONFIG_HIGHMEM
2760 if (!(dev->features & NETIF_F_HIGHDMA)) {
2761 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2763 if (PageHighMem(skb_frag_page(frag)))
2768 if (PCI_DMA_BUS_IS_PHYS) {
2769 struct device *pdev = dev->dev.parent;
2773 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2774 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2775 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2776 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2784 /* If MPLS offload request, verify we are testing hardware MPLS features
2785 * instead of standard features for the netdev.
2787 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2788 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2789 netdev_features_t features,
2792 if (eth_p_mpls(type))
2793 features &= skb->dev->mpls_features;
2798 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2799 netdev_features_t features,
2806 static netdev_features_t harmonize_features(struct sk_buff *skb,
2807 netdev_features_t features)
2812 type = skb_network_protocol(skb, &tmp);
2813 features = net_mpls_features(skb, features, type);
2815 if (skb->ip_summed != CHECKSUM_NONE &&
2816 !can_checksum_protocol(features, type)) {
2817 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2818 } else if (illegal_highdma(skb->dev, skb)) {
2819 features &= ~NETIF_F_SG;
2825 netdev_features_t passthru_features_check(struct sk_buff *skb,
2826 struct net_device *dev,
2827 netdev_features_t features)
2831 EXPORT_SYMBOL(passthru_features_check);
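/*
 * Editor's sketch: pseudo-devices that impose no restrictions of their
 * own can simply plug the passthru helper into their ops.
 */
static const struct net_device_ops example_netdev_ops = {
	.ndo_features_check	= passthru_features_check,
	/* ... remaining ops elided ... */
};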
2833 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2834 struct net_device *dev,
2835 netdev_features_t features)
2837 return vlan_features_check(skb, features);
2840 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2841 struct net_device *dev,
2842 netdev_features_t features)
2844 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2846 if (gso_segs > dev->gso_max_segs)
2847 return features & ~NETIF_F_GSO_MASK;
2849 /* Support for GSO partial features requires software
2850 * intervention before we can actually process the packets,
2851 * so we need to strip support for any partial features now;
2852 * we can pull them back in after we have partially
2853 * segmented the frame.
2855 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2856 features &= ~dev->gso_partial_features;
2858 /* Make sure to clear the IPv4 ID mangling feature if the
2859 * IPv4 header has the potential to be fragmented.
2861 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2862 struct iphdr *iph = skb->encapsulation ?
2863 inner_ip_hdr(skb) : ip_hdr(skb);
2865 if (!(iph->frag_off & htons(IP_DF)))
2866 features &= ~NETIF_F_TSO_MANGLEID;
2872 netdev_features_t netif_skb_features(struct sk_buff *skb)
2874 struct net_device *dev = skb->dev;
2875 netdev_features_t features = dev->features;
2877 if (skb_is_gso(skb))
2878 features = gso_features_check(skb, dev, features);
2880 /* If encapsulation offload request, verify we are testing
2881 * hardware encapsulation features instead of standard
2882 * features for the netdev
2884 if (skb->encapsulation)
2885 features &= dev->hw_enc_features;
2887 if (skb_vlan_tagged(skb))
2888 features = netdev_intersect_features(features,
2889 dev->vlan_features |
2890 NETIF_F_HW_VLAN_CTAG_TX |
2891 NETIF_F_HW_VLAN_STAG_TX);
2893 if (dev->netdev_ops->ndo_features_check)
2894 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2897 features &= dflt_features_check(skb, dev, features);
2899 return harmonize_features(skb, features);
2901 EXPORT_SYMBOL(netif_skb_features);
2903 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2904 struct netdev_queue *txq, bool more)
2909 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2910 dev_queue_xmit_nit(skb, dev);
2913 trace_net_dev_start_xmit(skb, dev);
2914 rc = netdev_start_xmit(skb, dev, txq, more);
2915 trace_net_dev_xmit(skb, rc, dev, len);
2920 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2921 struct netdev_queue *txq, int *ret)
2923 struct sk_buff *skb = first;
2924 int rc = NETDEV_TX_OK;
2927 struct sk_buff *next = skb->next;
2930 rc = xmit_one(skb, dev, txq, next != NULL);
2931 if (unlikely(!dev_xmit_complete(rc))) {
2937 if (netif_xmit_stopped(txq) && skb) {
2938 rc = NETDEV_TX_BUSY;
2948 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2949 netdev_features_t features)
2951 if (skb_vlan_tag_present(skb) &&
2952 !vlan_hw_offload_capable(features, skb->vlan_proto))
2953 skb = __vlan_hwaccel_push_inside(skb);
2957 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2959 netdev_features_t features;
2961 features = netif_skb_features(skb);
2962 skb = validate_xmit_vlan(skb, features);
2966 if (netif_needs_gso(skb, features)) {
2967 struct sk_buff *segs;
2969 segs = skb_gso_segment(skb, features);
2977 if (skb_needs_linearize(skb, features) &&
2978 __skb_linearize(skb))
2981 /* If packet is not checksummed and device does not
2982 * support checksumming for this protocol, complete
2983 * checksumming here.
2985 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2986 if (skb->encapsulation)
2987 skb_set_inner_transport_header(skb,
2988 skb_checksum_start_offset(skb));
2990 skb_set_transport_header(skb,
2991 skb_checksum_start_offset(skb));
2992 if (!(features & NETIF_F_CSUM_MASK) &&
2993 skb_checksum_help(skb))
3003 atomic_long_inc(&dev->tx_dropped);
3007 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3009 struct sk_buff *next, *head = NULL, *tail;
3011 for (; skb != NULL; skb = next) {
3015 /* in case skb won't be segmented, point to itself
3018 skb = validate_xmit_skb(skb, dev);
3026 /* If skb was segmented, skb->prev points to
3027 * the last segment. If not, it still contains skb.
3033 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3035 static void qdisc_pkt_len_init(struct sk_buff *skb)
3037 const struct skb_shared_info *shinfo = skb_shinfo(skb);
3039 qdisc_skb_cb(skb)->pkt_len = skb->len;
3041 /* To get a more precise estimate of bytes sent on the wire,
3042 * we add to pkt_len the header size of all segments
3044 if (shinfo->gso_size) {
3045 unsigned int hdr_len;
3046 u16 gso_segs = shinfo->gso_segs;
3048 /* mac layer + network layer */
3049 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3051 /* + transport layer */
3052 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3053 hdr_len += tcp_hdrlen(skb);
3055 hdr_len += sizeof(struct udphdr);
3057 if (shinfo->gso_type & SKB_GSO_DODGY)
3058 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3061 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3065 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3066 struct net_device *dev,
3067 struct netdev_queue *txq)
3069 spinlock_t *root_lock = qdisc_lock(q);
3070 struct sk_buff *to_free = NULL;
3074 qdisc_calculate_pkt_len(skb, q);
3076 * Heuristic to force contended enqueues to serialize on a
3077 * separate lock before trying to get the qdisc main lock.
3078 * This permits the qdisc->running owner to get the lock more
3079 * often and dequeue packets faster.
3081 contended = qdisc_is_running(q);
3082 if (unlikely(contended))
3083 spin_lock(&q->busylock);
3085 spin_lock(root_lock);
3086 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3087 __qdisc_drop(skb, &to_free);
3089 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3090 qdisc_run_begin(q)) {
3092 * This is a work-conserving queue; there are no old skbs
3093 * waiting to be sent out; and the qdisc is not running -
3094 * xmit the skb directly.
3097 qdisc_bstats_update(q, skb);
3099 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3100 if (unlikely(contended)) {
3101 spin_unlock(&q->busylock);
3108 rc = NET_XMIT_SUCCESS;
3110 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3111 if (qdisc_run_begin(q)) {
3112 if (unlikely(contended)) {
3113 spin_unlock(&q->busylock);
3119 spin_unlock(root_lock);
3120 if (unlikely(to_free))
3121 kfree_skb_list(to_free);
3122 if (unlikely(contended))
3123 spin_unlock(&q->busylock);
3127 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3128 static void skb_update_prio(struct sk_buff *skb)
3130 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3132 if (!skb->priority && skb->sk && map) {
3133 unsigned int prioidx =
3134 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3136 if (prioidx < map->priomap_len)
3137 skb->priority = map->priomap[prioidx];
3141 #define skb_update_prio(skb)
3144 DEFINE_PER_CPU(int, xmit_recursion);
3145 EXPORT_SYMBOL(xmit_recursion);
3148 * dev_loopback_xmit - loop back @skb
3149 * @net: network namespace this loopback is happening in
3150 * @sk: sk needed to be a netfilter okfn
3151 * @skb: buffer to transmit
3153 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3155 skb_reset_mac_header(skb);
3156 __skb_pull(skb, skb_network_offset(skb));
3157 skb->pkt_type = PACKET_LOOPBACK;
3158 skb->ip_summed = CHECKSUM_UNNECESSARY;
3159 WARN_ON(!skb_dst(skb));
3164 EXPORT_SYMBOL(dev_loopback_xmit);
3166 #ifdef CONFIG_NET_EGRESS
3167 static struct sk_buff *
3168 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3170 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3171 struct tcf_result cl_res;
3176 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3177 * earlier by the caller.
3179 qdisc_bstats_cpu_update(cl->q, skb);
3181 switch (tc_classify(skb, cl, &cl_res, false)) {
3183 case TC_ACT_RECLASSIFY:
3184 skb->tc_index = TC_H_MIN(cl_res.classid);
3187 qdisc_qstats_cpu_drop(cl->q);
3188 *ret = NET_XMIT_DROP;
3193 *ret = NET_XMIT_SUCCESS;
3196 case TC_ACT_REDIRECT:
3197 /* No need to push/pop skb's mac_header here on egress! */
3198 skb_do_redirect(skb);
3199 *ret = NET_XMIT_SUCCESS;
3207 #endif /* CONFIG_NET_EGRESS */
3209 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3212 struct xps_dev_maps *dev_maps;
3213 struct xps_map *map;
3214 int queue_index = -1;
3217 dev_maps = rcu_dereference(dev->xps_maps);
3219 map = rcu_dereference(
3220 dev_maps->cpu_map[skb->sender_cpu - 1]);
3223 queue_index = map->queues[0];
3225 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3227 if (unlikely(queue_index >= dev->real_num_tx_queues))
3239 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3241 struct sock *sk = skb->sk;
3242 int queue_index = sk_tx_queue_get(sk);
3244 if (queue_index < 0 || skb->ooo_okay ||
3245 queue_index >= dev->real_num_tx_queues) {
3246 int new_index = get_xps_queue(dev, skb);
3248 new_index = skb_tx_hash(dev, skb);
3250 if (queue_index != new_index && sk &&
3252 rcu_access_pointer(sk->sk_dst_cache))
3253 sk_tx_queue_set(sk, new_index);
3255 queue_index = new_index;
3261 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3262 struct sk_buff *skb,
3265 int queue_index = 0;
3268 u32 sender_cpu = skb->sender_cpu - 1;
3270 if (sender_cpu >= (u32)NR_CPUS)
3271 skb->sender_cpu = raw_smp_processor_id() + 1;
3274 if (dev->real_num_tx_queues != 1) {
3275 const struct net_device_ops *ops = dev->netdev_ops;
3276 if (ops->ndo_select_queue)
3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3280 queue_index = __netdev_pick_tx(dev, skb);
3283 queue_index = netdev_cap_txqueue(dev, queue_index);
3286 skb_set_queue_mapping(skb, queue_index);
3287 return netdev_get_tx_queue(dev, queue_index);
3291 * __dev_queue_xmit - transmit a buffer
3292 * @skb: buffer to transmit
3293 * @accel_priv: private data used for L2 forwarding offload
3295 * Queue a buffer for transmission to a network device. The caller must
3296 * have set the device and priority and built the buffer before calling
3297 * this function. The function can be called from an interrupt.
3299 * A negative errno code is returned on a failure. A success does not
3300 * guarantee the frame will be transmitted as it may be dropped due
3301 * to congestion or traffic shaping.
3303 * -----------------------------------------------------------------------------------
3304 * I notice this method can also return errors from the queue disciplines,
3305 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3306 * be positive.
3308 * Regardless of the return value, the skb is consumed, so it is currently
3309 * difficult to retry a send to this method. (You can bump the ref count
3310 * before sending to hold a reference for retry if you are careful.)
3312 * When calling this method, interrupts MUST be enabled. This is because
3313 * the BH enable code must have IRQs enabled so that it will not deadlock.
3316 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3318 struct net_device *dev = skb->dev;
3319 struct netdev_queue *txq;
3323 skb_reset_mac_header(skb);
3325 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3326 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3328 /* Disable soft irqs for various locks below. Also
3329 * stops preemption for RCU.
3333 skb_update_prio(skb);
3335 qdisc_pkt_len_init(skb);
3336 #ifdef CONFIG_NET_CLS_ACT
3337 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3338 # ifdef CONFIG_NET_EGRESS
3339 if (static_key_false(&egress_needed)) {
3340 skb = sch_handle_egress(skb, &rc, dev);
3346 /* If device/qdisc don't need skb->dst, release it right now while
3347 * it's hot in this CPU's cache.
3349 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3354 txq = netdev_pick_tx(dev, skb, accel_priv);
3355 q = rcu_dereference_bh(txq->qdisc);
3357 trace_net_dev_queue(skb);
3359 rc = __dev_xmit_skb(skb, q, dev, txq);
3363 /* The device has no queue. Common case for software devices:
3364 loopback, all sorts of tunnels...
3366 Really, it is unlikely that netif_tx_lock protection is necessary
3367 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
3369 However, it is possible that they rely on the protection
3372 Check this and take the lock. It is not prone to deadlocks.
3373 Alternatively, shoot the noqueue qdisc; that is even simpler 8)
3375 if (dev->flags & IFF_UP) {
3376 int cpu = smp_processor_id(); /* ok because BHs are off */
3378 if (txq->xmit_lock_owner != cpu) {
3379 if (unlikely(__this_cpu_read(xmit_recursion) >
3380 XMIT_RECURSION_LIMIT))
3381 goto recursion_alert;
3383 skb = validate_xmit_skb(skb, dev);
3387 HARD_TX_LOCK(dev, txq, cpu);
3389 if (!netif_xmit_stopped(txq)) {
3390 __this_cpu_inc(xmit_recursion);
3391 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3392 __this_cpu_dec(xmit_recursion);
3393 if (dev_xmit_complete(rc)) {
3394 HARD_TX_UNLOCK(dev, txq);
3398 HARD_TX_UNLOCK(dev, txq);
3399 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3402 /* Recursion is detected! It is possible, unfortunately.
3406 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3412 rcu_read_unlock_bh();
3414 atomic_long_inc(&dev->tx_dropped);
3415 kfree_skb_list(skb);
3418 rcu_read_unlock_bh();
3422 int dev_queue_xmit(struct sk_buff *skb)
3424 return __dev_queue_xmit(skb, NULL);
3426 EXPORT_SYMBOL(dev_queue_xmit);
3428 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3430 return __dev_queue_xmit(skb, accel_priv);
3432 EXPORT_SYMBOL(dev_queue_xmit_accel);
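/*
 * Editor's sketch: the minimal path an in-kernel sender follows through
 * dev_queue_xmit(). It assumes "frame" already holds a complete link
 * layer frame for "dev". Note the skb is consumed whatever the result.
 */
static int example_send_frame(struct net_device *dev, const void *frame,
			      size_t len, __be16 proto)
{
	struct sk_buff *skb = alloc_skb(LL_RESERVED_SPACE(dev) + len,
					GFP_ATOMIC);

	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), frame, len);
	skb->dev = dev;
	skb->protocol = proto;

	/* May return a negative errno or a positive NET_XMIT_* code;
	 * see the comment above __dev_queue_xmit().
	 */
	return dev_queue_xmit(skb);
}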
3435 /*=======================================================================
3436 Receiver routines
3437 =======================================================================*/
3439 int netdev_max_backlog __read_mostly = 1000;
3440 EXPORT_SYMBOL(netdev_max_backlog);
3442 int netdev_tstamp_prequeue __read_mostly = 1;
3443 int netdev_budget __read_mostly = 300;
3444 int weight_p __read_mostly = 64; /* old backlog weight */
3446 /* Called with irq disabled */
3447 static inline void ____napi_schedule(struct softnet_data *sd,
3448 struct napi_struct *napi)
3450 list_add_tail(&napi->poll_list, &sd->poll_list);
3451 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3456 /* One global table that all flow-based protocols share. */
3457 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3458 EXPORT_SYMBOL(rps_sock_flow_table);
3459 u32 rps_cpu_mask __read_mostly;
3460 EXPORT_SYMBOL(rps_cpu_mask);
3462 struct static_key rps_needed __read_mostly;
3463 EXPORT_SYMBOL(rps_needed);
3465 static struct rps_dev_flow *
3466 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3467 struct rps_dev_flow *rflow, u16 next_cpu)
3469 if (next_cpu < nr_cpu_ids) {
3470 #ifdef CONFIG_RFS_ACCEL
3471 struct netdev_rx_queue *rxqueue;
3472 struct rps_dev_flow_table *flow_table;
3473 struct rps_dev_flow *old_rflow;
3478 /* Should we steer this flow to a different hardware queue? */
3479 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3480 !(dev->features & NETIF_F_NTUPLE))
3482 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3483 if (rxq_index == skb_get_rx_queue(skb))
3486 rxqueue = dev->_rx + rxq_index;
3487 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3490 flow_id = skb_get_hash(skb) & flow_table->mask;
3491 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3492 rxq_index, flow_id);
3496 rflow = &flow_table->flows[flow_id];
3498 if (old_rflow->filter == rflow->filter)
3499 old_rflow->filter = RPS_NO_FILTER;
3503 per_cpu(softnet_data, next_cpu).input_queue_head;
3506 rflow->cpu = next_cpu;
3511 * get_rps_cpu is called from netif_receive_skb and returns the target
3512 * CPU from the RPS map of the receiving queue for a given skb.
3513 * rcu_read_lock must be held on entry.
3515 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3516 struct rps_dev_flow **rflowp)
3518 const struct rps_sock_flow_table *sock_flow_table;
3519 struct netdev_rx_queue *rxqueue = dev->_rx;
3520 struct rps_dev_flow_table *flow_table;
3521 struct rps_map *map;
3526 if (skb_rx_queue_recorded(skb)) {
3527 u16 index = skb_get_rx_queue(skb);
3529 if (unlikely(index >= dev->real_num_rx_queues)) {
3530 WARN_ONCE(dev->real_num_rx_queues > 1,
3531 "%s received packet on queue %u, but number "
3532 "of RX queues is %u\n",
3533 dev->name, index, dev->real_num_rx_queues);
3539 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3541 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3542 map = rcu_dereference(rxqueue->rps_map);
3543 if (!flow_table && !map)
3546 skb_reset_network_header(skb);
3547 hash = skb_get_hash(skb);
3551 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3552 if (flow_table && sock_flow_table) {
3553 struct rps_dev_flow *rflow;
3557 /* First check into global flow table if there is a match */
3558 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3559 if ((ident ^ hash) & ~rps_cpu_mask)
3562 next_cpu = ident & rps_cpu_mask;
3564 /* OK, now we know there is a match,
3565 * we can look at the local (per receive queue) flow table
3567 rflow = &flow_table->flows[hash & flow_table->mask];
3571 * If the desired CPU (where last recvmsg was done) is
3572 * different from current CPU (one in the rx-queue flow
3573 * table entry), switch if one of the following holds:
3574 * - Current CPU is unset (>= nr_cpu_ids).
3575 * - Current CPU is offline.
3576 * - The current CPU's queue tail has advanced beyond the
3577 * last packet that was enqueued using this table entry.
3578 * This guarantees that all previous packets for the flow
3579 * have been dequeued, thus preserving in order delivery.
3581 if (unlikely(tcpu != next_cpu) &&
3582 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3583 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3584 rflow->last_qtail)) >= 0)) {
3586 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3589 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3599 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3600 if (cpu_online(tcpu)) {
3610 #ifdef CONFIG_RFS_ACCEL
3613 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3614 * @dev: Device on which the filter was set
3615 * @rxq_index: RX queue index
3616 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3617 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3619 * Drivers that implement ndo_rx_flow_steer() should periodically call
3620 * this function for each installed filter and remove the filters for
3621 * which it returns %true.
3623 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3624 u32 flow_id, u16 filter_id)
3626 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3627 struct rps_dev_flow_table *flow_table;
3628 struct rps_dev_flow *rflow;
3633 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3634 if (flow_table && flow_id <= flow_table->mask) {
3635 rflow = &flow_table->flows[flow_id];
3636 cpu = ACCESS_ONCE(rflow->cpu);
3637 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3638 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3639 rflow->last_qtail) <
3640 (int)(10 * flow_table->mask)))
3646 EXPORT_SYMBOL(rps_may_expire_flow);
3648 #endif /* CONFIG_RFS_ACCEL */
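#ifdef CONFIG_RFS_ACCEL
/*
 * Editor's sketch: the periodic scan that the kernel-doc above asks of
 * ndo_rx_flow_steer() implementations. "struct example_filter" is a
 * made-up driver-side record of one installed hardware filter.
 */
struct example_filter {
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
};

static bool example_try_expire_filter(struct net_device *dev,
				      struct example_filter *f)
{
	if (!rps_may_expire_flow(dev, f->rxq_index, f->flow_id,
				 f->filter_id))
		return false;

	/* ... remove the steering filter from the hardware ... */
	return true;
}
#endif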
3650 /* Called from hardirq (IPI) context */
3651 static void rps_trigger_softirq(void *data)
3653 struct softnet_data *sd = data;
3655 ____napi_schedule(sd, &sd->backlog);
3659 #endif /* CONFIG_RPS */
3662 * Check if this softnet_data structure belongs to another CPU.
3663 * If yes, queue it to our IPI list and return 1.
3666 static int rps_ipi_queued(struct softnet_data *sd)
3669 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3672 sd->rps_ipi_next = mysd->rps_ipi_list;
3673 mysd->rps_ipi_list = sd;
3675 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3678 #endif /* CONFIG_RPS */
3682 #ifdef CONFIG_NET_FLOW_LIMIT
3683 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3686 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3688 #ifdef CONFIG_NET_FLOW_LIMIT
3689 struct sd_flow_limit *fl;
3690 struct softnet_data *sd;
3691 unsigned int old_flow, new_flow;
3693 if (qlen < (netdev_max_backlog >> 1))
3696 sd = this_cpu_ptr(&softnet_data);
3699 fl = rcu_dereference(sd->flow_limit);
3701 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3702 old_flow = fl->history[fl->history_head];
3703 fl->history[fl->history_head] = new_flow;
3706 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3708 if (likely(fl->buckets[old_flow]))
3709 fl->buckets[old_flow]--;
3711 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3723 * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3724 * queue (which may be a remote CPU's queue).
3726 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3727 unsigned int *qtail)
3729 struct softnet_data *sd;
3730 unsigned long flags;
3733 sd = &per_cpu(softnet_data, cpu);
3735 local_irq_save(flags);
3738 if (!netif_running(skb->dev))
3740 qlen = skb_queue_len(&sd->input_pkt_queue);
3741 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3744 __skb_queue_tail(&sd->input_pkt_queue, skb);
3745 input_queue_tail_incr_save(sd, qtail);
3747 local_irq_restore(flags);
3748 return NET_RX_SUCCESS;
3751 /* Schedule NAPI for the backlog device.
3752 * We can use non-atomic operations since we own the queue lock.
3754 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3755 if (!rps_ipi_queued(sd))
3756 ____napi_schedule(sd, &sd->backlog);
3765 local_irq_restore(flags);
3767 atomic_long_inc(&skb->dev->rx_dropped);
3772 static int netif_rx_internal(struct sk_buff *skb)
3776 net_timestamp_check(netdev_tstamp_prequeue, skb);
3778 trace_netif_rx(skb);
3780 if (static_key_false(&rps_needed)) {
3781 struct rps_dev_flow voidflow, *rflow = &voidflow;
3787 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3789 cpu = smp_processor_id();
3791 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3799 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3806 * netif_rx - post buffer to the network code
3807 * @skb: buffer to post
3809 * This function receives a packet from a device driver and queues it for
3810 * the upper (protocol) levels to process. It always succeeds. The buffer
3811 * may be dropped during processing for congestion control or by the
3812 * protocol layers.
3815 * NET_RX_SUCCESS (no congestion)
3816 * NET_RX_DROP (packet was dropped)
3820 int netif_rx(struct sk_buff *skb)
3822 trace_netif_rx_entry(skb);
3824 return netif_rx_internal(skb);
3826 EXPORT_SYMBOL(netif_rx);
3828 int netif_rx_ni(struct sk_buff *skb)
3832 trace_netif_rx_ni_entry(skb);
3835 err = netif_rx_internal(skb);
3836 if (local_softirq_pending())
3842 EXPORT_SYMBOL(netif_rx_ni);
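/*
 * Editor's sketch: which entry point to use depends on context.
 * netif_rx() is safe from interrupt context; netif_rx_ni() is for
 * process context, where it also runs any softirq it raised.
 */
static int example_deliver(struct sk_buff *skb)
{
	if (in_interrupt())
		return netif_rx(skb);

	return netif_rx_ni(skb);
}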
3844 static __latent_entropy void net_tx_action(struct softirq_action *h)
3846 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3848 if (sd->completion_queue) {
3849 struct sk_buff *clist;
3851 local_irq_disable();
3852 clist = sd->completion_queue;
3853 sd->completion_queue = NULL;
3857 struct sk_buff *skb = clist;
3858 clist = clist->next;
3860 WARN_ON(atomic_read(&skb->users));
3861 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3862 trace_consume_skb(skb);
3864 trace_kfree_skb(skb, net_tx_action);
3866 if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3869 __kfree_skb_defer(skb);
3872 __kfree_skb_flush();
3875 if (sd->output_queue) {
3878 local_irq_disable();
3879 head = sd->output_queue;
3880 sd->output_queue = NULL;
3881 sd->output_queue_tailp = &sd->output_queue;
3885 struct Qdisc *q = head;
3886 spinlock_t *root_lock;
3888 head = head->next_sched;
3890 root_lock = qdisc_lock(q);
3891 spin_lock(root_lock);
3892 /* We need to make sure head->next_sched is read
3893 * before clearing __QDISC_STATE_SCHED
3895 smp_mb__before_atomic();
3896 clear_bit(__QDISC_STATE_SCHED, &q->state);
3898 spin_unlock(root_lock);
3903 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3904 /* This hook is defined here for ATM LANE */
3905 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3906 unsigned char *addr) __read_mostly;
3907 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3910 static inline struct sk_buff *
3911 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3912 struct net_device *orig_dev)
3914 #ifdef CONFIG_NET_CLS_ACT
3915 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3916 struct tcf_result cl_res;
3918 /* If there's at least one ingress present somewhere (so
3919 * we get here via enabled static key), remaining devices
3920 * that are not configured with an ingress qdisc will bail
3921 * out here.
3926 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3930 qdisc_skb_cb(skb)->pkt_len = skb->len;
3931 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3932 qdisc_bstats_cpu_update(cl->q, skb);
3934 switch (tc_classify(skb, cl, &cl_res, false)) {
3936 case TC_ACT_RECLASSIFY:
3937 skb->tc_index = TC_H_MIN(cl_res.classid);
3940 qdisc_qstats_cpu_drop(cl->q);
3947 case TC_ACT_REDIRECT:
3948 /* skb_mac_header check was done by cls/act_bpf, so
3949 * we can safely push the L2 header back before
3950 * redirecting to another netdev
3952 __skb_push(skb, skb->mac_len);
3953 skb_do_redirect(skb);
3958 #endif /* CONFIG_NET_CLS_ACT */
3963 * netdev_is_rx_handler_busy - check if receive handler is registered
3964 * @dev: device to check
3966 * Check if a receive handler is already registered for a given device.
3967 * Return true if there is one.
3969 * The caller must hold the rtnl_mutex.
3971 bool netdev_is_rx_handler_busy(struct net_device *dev)
3974 return dev && rtnl_dereference(dev->rx_handler);
3976 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3979 * netdev_rx_handler_register - register receive handler
3980 * @dev: device to register a handler for
3981 * @rx_handler: receive handler to register
3982 * @rx_handler_data: data pointer that is used by rx handler
3984 * Register a receive handler for a device. This handler will then be
3985 * called from __netif_receive_skb. A negative errno code is returned
3986 * on a failure.
3988 * The caller must hold the rtnl_mutex.
3990 * For a general description of rx_handler, see enum rx_handler_result.
3992 int netdev_rx_handler_register(struct net_device *dev,
3993 rx_handler_func_t *rx_handler,
3994 void *rx_handler_data)
3998 if (dev->rx_handler)
4001 /* Note: rx_handler_data must be set before rx_handler */
4002 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4003 rcu_assign_pointer(dev->rx_handler, rx_handler);
4007 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
4010 * netdev_rx_handler_unregister - unregister receive handler
4011 * @dev: device to unregister a handler from
4013 * Unregister a receive handler from a device.
4015 * The caller must hold the rtnl_mutex.
4017 void netdev_rx_handler_unregister(struct net_device *dev)
4021 RCU_INIT_POINTER(dev->rx_handler, NULL);
4022 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4023 * section is guaranteed to also see a non-NULL rx_handler_data
4027 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4029 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
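/*
 * Editor's sketch: the pattern bridge/bonding-style upper devices use,
 * claiming a port's receive path with an rx_handler and stashing private
 * state in rx_handler_data. Both example functions are made up.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	/* inspect *pskb, possibly retarget (*pskb)->dev, consume it, ... */
	return RX_HANDLER_PASS;	/* let the normal stack process it */
}

static int example_enslave(struct net_device *port, void *priv)
{
	ASSERT_RTNL();

	return netdev_rx_handler_register(port, example_handle_frame, priv);
}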
4032 * Limit the use of PFMEMALLOC reserves to those protocols that implement
4033 * the special handling of PFMEMALLOC skbs.
4035 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4037 switch (skb->protocol) {
4038 case htons(ETH_P_ARP):
4039 case htons(ETH_P_IP):
4040 case htons(ETH_P_IPV6):
4041 case htons(ETH_P_8021Q):
4042 case htons(ETH_P_8021AD):
4049 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4050 int *ret, struct net_device *orig_dev)
4052 #ifdef CONFIG_NETFILTER_INGRESS
4053 if (nf_hook_ingress_active(skb)) {
4057 *ret = deliver_skb(skb, *pt_prev, orig_dev);
4062 ingress_retval = nf_hook_ingress(skb);
4064 return ingress_retval;
4066 #endif /* CONFIG_NETFILTER_INGRESS */
4070 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4072 struct packet_type *ptype, *pt_prev;
4073 rx_handler_func_t *rx_handler;
4074 struct net_device *orig_dev;
4075 bool deliver_exact = false;
4076 int ret = NET_RX_DROP;
4079 net_timestamp_check(!netdev_tstamp_prequeue, skb);
4081 trace_netif_receive_skb(skb);
4083 orig_dev = skb->dev;
4085 skb_reset_network_header(skb);
4086 if (!skb_transport_header_was_set(skb))
4087 skb_reset_transport_header(skb);
4088 skb_reset_mac_len(skb);
4093 skb->skb_iif = skb->dev->ifindex;
4095 __this_cpu_inc(softnet_data.processed);
4097 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4098 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4099 skb = skb_vlan_untag(skb);
4104 #ifdef CONFIG_NET_CLS_ACT
4105 if (skb->tc_verd & TC_NCLS) {
4106 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4114 list_for_each_entry_rcu(ptype, &ptype_all, list) {
4116 ret = deliver_skb(skb, pt_prev, orig_dev);
4120 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4122 ret = deliver_skb(skb, pt_prev, orig_dev);
4127 #ifdef CONFIG_NET_INGRESS
4128 if (static_key_false(&ingress_needed)) {
4129 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4133 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4137 #ifdef CONFIG_NET_CLS_ACT
4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4144 if (skb_vlan_tag_present(skb)) {
4146 ret = deliver_skb(skb, pt_prev, orig_dev);
4149 if (vlan_do_receive(&skb))
4151 else if (unlikely(!skb))
4155 rx_handler = rcu_dereference(skb->dev->rx_handler);
4158 ret = deliver_skb(skb, pt_prev, orig_dev);
4161 switch (rx_handler(&skb)) {
4162 case RX_HANDLER_CONSUMED:
4163 ret = NET_RX_SUCCESS;
4165 case RX_HANDLER_ANOTHER:
4167 case RX_HANDLER_EXACT:
4168 deliver_exact = true;
4169 case RX_HANDLER_PASS:
4176 if (unlikely(skb_vlan_tag_present(skb))) {
4177 if (skb_vlan_tag_get_id(skb))
4178 skb->pkt_type = PACKET_OTHERHOST;
4179 /* Note: we might in the future use prio bits
4180 * and set skb->priority like in vlan_do_receive().
4181 * For the time being, just ignore the Priority Code Point.
4186 type = skb->protocol;
4188 /* deliver only exact match when indicated */
4189 if (likely(!deliver_exact)) {
4190 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4191 &ptype_base[ntohs(type) &
4195 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4196 &orig_dev->ptype_specific);
4198 if (unlikely(skb->dev != orig_dev)) {
4199 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4200 &skb->dev->ptype_specific);
4204 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4207 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4211 atomic_long_inc(&skb->dev->rx_dropped);
4213 atomic_long_inc(&skb->dev->rx_nohandler);
4215 /* Jamal, now you will not be able to escape explaining
4216 * to me how you were going to use this. :-)
4225 static int __netif_receive_skb(struct sk_buff *skb)
4229 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4230 unsigned long pflags = current->flags;
4233 * PFMEMALLOC skbs are special, they should
4234 * - be delivered to SOCK_MEMALLOC sockets only
4235 * - stay away from userspace
4236 * - have bounded memory usage
4238 * Use PF_MEMALLOC as this saves us from propagating the allocation
4239 * context down to all allocation sites.
4241 current->flags |= PF_MEMALLOC;
4242 ret = __netif_receive_skb_core(skb, true);
4243 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4245 ret = __netif_receive_skb_core(skb, false);
4250 static int netif_receive_skb_internal(struct sk_buff *skb)
4254 net_timestamp_check(netdev_tstamp_prequeue, skb);
4256 if (skb_defer_rx_timestamp(skb))
4257 return NET_RX_SUCCESS;
4262 if (static_key_false(&rps_needed)) {
4263 struct rps_dev_flow voidflow, *rflow = &voidflow;
4264 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4267 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4273 ret = __netif_receive_skb(skb);
4279 * netif_receive_skb - process receive buffer from network
4280 * @skb: buffer to process
4282 * netif_receive_skb() is the main receive data processing function.
4283 * It always succeeds. The buffer may be dropped during processing
4284 * for congestion control or by the protocol layers.
4286 * This function may only be called from softirq context and interrupts
4287 * should be enabled.
4289 * Return values (usually ignored):
4290 * NET_RX_SUCCESS: no congestion
4291 * NET_RX_DROP: packet was dropped
4293 int netif_receive_skb(struct sk_buff *skb)
4295 trace_netif_receive_skb_entry(skb);
4297 return netif_receive_skb_internal(skb);
4299 EXPORT_SYMBOL(netif_receive_skb);
4301 DEFINE_PER_CPU(struct work_struct, flush_works);
4303 /* Network device is going away, flush any packets still pending */
4304 static void flush_backlog(struct work_struct *work)
4306 struct sk_buff *skb, *tmp;
4307 struct softnet_data *sd;
4310 sd = this_cpu_ptr(&softnet_data);
4312 local_irq_disable();
4314 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4315 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4316 __skb_unlink(skb, &sd->input_pkt_queue);
4318 input_queue_head_incr(sd);
4324 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4325 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4326 __skb_unlink(skb, &sd->process_queue);
4328 input_queue_head_incr(sd);
4334 static void flush_all_backlogs(void)
4340 for_each_online_cpu(cpu)
4341 queue_work_on(cpu, system_highpri_wq,
4342 per_cpu_ptr(&flush_works, cpu));
4344 for_each_online_cpu(cpu)
4345 flush_work(per_cpu_ptr(&flush_works, cpu));
4350 static int napi_gro_complete(struct sk_buff *skb)
4352 struct packet_offload *ptype;
4353 __be16 type = skb->protocol;
4354 struct list_head *head = &offload_base;
4357 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4359 if (NAPI_GRO_CB(skb)->count == 1) {
4360 skb_shinfo(skb)->gso_size = 0;
4365 list_for_each_entry_rcu(ptype, head, list) {
4366 if (ptype->type != type || !ptype->callbacks.gro_complete)
4369 err = ptype->callbacks.gro_complete(skb, 0);
4375 WARN_ON(&ptype->list == head);
4377 return NET_RX_SUCCESS;
4381 return netif_receive_skb_internal(skb);
4384 /* napi->gro_list contains packets ordered by age, with the
4385 * youngest packets at the head of it.
4386 * Complete skbs in reverse order to reduce latencies.
4388 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4390 struct sk_buff *skb, *prev = NULL;
4392 /* scan list and build reverse chain */
4393 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4398 for (skb = prev; skb; skb = prev) {
4401 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4405 napi_gro_complete(skb);
4409 napi->gro_list = NULL;
4411 EXPORT_SYMBOL(napi_gro_flush);
4413 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4416 unsigned int maclen = skb->dev->hard_header_len;
4417 u32 hash = skb_get_hash_raw(skb);
4419 for (p = napi->gro_list; p; p = p->next) {
4420 unsigned long diffs;
4422 NAPI_GRO_CB(p)->flush = 0;
4424 if (hash != skb_get_hash_raw(p)) {
4425 NAPI_GRO_CB(p)->same_flow = 0;
4429 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4430 diffs |= p->vlan_tci ^ skb->vlan_tci;
4431 diffs |= skb_metadata_dst_cmp(p, skb);
4432 if (maclen == ETH_HLEN)
4433 diffs |= compare_ether_header(skb_mac_header(p),
4434 skb_mac_header(skb));
4436 diffs = memcmp(skb_mac_header(p),
4437 skb_mac_header(skb),
4439 NAPI_GRO_CB(p)->same_flow = !diffs;
4443 static void skb_gro_reset_offset(struct sk_buff *skb)
4445 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4446 const skb_frag_t *frag0 = &pinfo->frags[0];
4448 NAPI_GRO_CB(skb)->data_offset = 0;
4449 NAPI_GRO_CB(skb)->frag0 = NULL;
4450 NAPI_GRO_CB(skb)->frag0_len = 0;
4452 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4454 !PageHighMem(skb_frag_page(frag0))) {
4455 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4456 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4460 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4462 struct skb_shared_info *pinfo = skb_shinfo(skb);
4464 BUG_ON(skb->end - skb->tail < grow);
4466 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4468 skb->data_len -= grow;
4471 pinfo->frags[0].page_offset += grow;
4472 skb_frag_size_sub(&pinfo->frags[0], grow);
4474 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4475 skb_frag_unref(skb, 0);
4476 memmove(pinfo->frags, pinfo->frags + 1,
4477 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4481 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4483 struct sk_buff **pp = NULL;
4484 struct packet_offload *ptype;
4485 __be16 type = skb->protocol;
4486 struct list_head *head = &offload_base;
4488 enum gro_result ret;
4491 if (!(skb->dev->features & NETIF_F_GRO))
4494 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4497 gro_list_prepare(napi, skb);
4500 list_for_each_entry_rcu(ptype, head, list) {
4501 if (ptype->type != type || !ptype->callbacks.gro_receive)
4504 skb_set_network_header(skb, skb_gro_offset(skb));
4505 skb_reset_mac_len(skb);
4506 NAPI_GRO_CB(skb)->same_flow = 0;
4507 NAPI_GRO_CB(skb)->flush = 0;
4508 NAPI_GRO_CB(skb)->free = 0;
4509 NAPI_GRO_CB(skb)->encap_mark = 0;
4510 NAPI_GRO_CB(skb)->recursion_counter = 0;
4511 NAPI_GRO_CB(skb)->is_fou = 0;
4512 NAPI_GRO_CB(skb)->is_atomic = 1;
4513 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4515 /* Setup for GRO checksum validation */
4516 switch (skb->ip_summed) {
4517 case CHECKSUM_COMPLETE:
4518 NAPI_GRO_CB(skb)->csum = skb->csum;
4519 NAPI_GRO_CB(skb)->csum_valid = 1;
4520 NAPI_GRO_CB(skb)->csum_cnt = 0;
4522 case CHECKSUM_UNNECESSARY:
4523 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4524 NAPI_GRO_CB(skb)->csum_valid = 0;
4527 NAPI_GRO_CB(skb)->csum_cnt = 0;
4528 NAPI_GRO_CB(skb)->csum_valid = 0;
4531 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4536 if (&ptype->list == head)
4539 same_flow = NAPI_GRO_CB(skb)->same_flow;
4540 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4543 struct sk_buff *nskb = *pp;
4547 napi_gro_complete(nskb);
4554 if (NAPI_GRO_CB(skb)->flush)
4557 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4558 struct sk_buff *nskb = napi->gro_list;
4560 /* locate the end of the list to select the 'oldest' flow */
4561 while (nskb->next) {
4567 napi_gro_complete(nskb);
4571 NAPI_GRO_CB(skb)->count = 1;
4572 NAPI_GRO_CB(skb)->age = jiffies;
4573 NAPI_GRO_CB(skb)->last = skb;
4574 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4575 skb->next = napi->gro_list;
4576 napi->gro_list = skb;
4580 grow = skb_gro_offset(skb) - skb_headlen(skb);
4582 gro_pull_from_frag0(skb, grow);
4591 struct packet_offload *gro_find_receive_by_type(__be16 type)
4593 struct list_head *offload_head = &offload_base;
4594 struct packet_offload *ptype;
4596 list_for_each_entry_rcu(ptype, offload_head, list) {
4597 if (ptype->type != type || !ptype->callbacks.gro_receive)
4603 EXPORT_SYMBOL(gro_find_receive_by_type);
4605 struct packet_offload *gro_find_complete_by_type(__be16 type)
4607 struct list_head *offload_head = &offload_base;
4608 struct packet_offload *ptype;
4610 list_for_each_entry_rcu(ptype, offload_head, list) {
4611 if (ptype->type != type || !ptype->callbacks.gro_complete)
4617 EXPORT_SYMBOL(gro_find_complete_by_type);
4619 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4623 if (netif_receive_skb_internal(skb))
4631 case GRO_MERGED_FREE:
4632 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4634 kmem_cache_free(skbuff_head_cache, skb);
4648 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4650 skb_mark_napi_id(skb, napi);
4651 trace_napi_gro_receive_entry(skb);
4653 skb_gro_reset_offset(skb);
4655 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4657 EXPORT_SYMBOL(napi_gro_receive);
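/*
 * Editor's sketch: the canonical NAPI poll loop feeding received buffers
 * through GRO and completing NAPI when the budget was not exhausted.
 * example_rx_fetch() is a hypothetical per-driver dequeue helper.
 */
static struct sk_buff *example_rx_fetch(struct napi_struct *napi);

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;
	struct sk_buff *skb;

	while (done < budget && (skb = example_rx_fetch(napi)) != NULL) {
		napi_gro_receive(napi, skb);
		done++;
	}

	if (done < budget)
		napi_complete_done(napi, done);

	return done;
}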
4659 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4661 if (unlikely(skb->pfmemalloc)) {
4665 __skb_pull(skb, skb_headlen(skb));
4666 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4667 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4669 skb->dev = napi->dev;
4671 skb->encapsulation = 0;
4672 skb_shinfo(skb)->gso_type = 0;
4673 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4678 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4680 struct sk_buff *skb = napi->skb;
4683 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4686 skb_mark_napi_id(skb, napi);
4691 EXPORT_SYMBOL(napi_get_frags);
4693 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4694 struct sk_buff *skb,
4700 __skb_push(skb, ETH_HLEN);
4701 skb->protocol = eth_type_trans(skb, skb->dev);
4702 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4707 case GRO_MERGED_FREE:
4708 napi_reuse_skb(napi, skb);
4718 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4719 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4720 * we copy the ethernet header into skb->data to have a common layout.
4722 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4724 struct sk_buff *skb = napi->skb;
4725 const struct ethhdr *eth;
4726 unsigned int hlen = sizeof(*eth);
4730 skb_reset_mac_header(skb);
4731 skb_gro_reset_offset(skb);
4733 eth = skb_gro_header_fast(skb, 0);
4734 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4735 eth = skb_gro_header_slow(skb, hlen, 0);
4736 if (unlikely(!eth)) {
4737 net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4738 __func__, napi->dev->name);
4739 napi_reuse_skb(napi, skb);
4743 gro_pull_from_frag0(skb, hlen);
4744 NAPI_GRO_CB(skb)->frag0 += hlen;
4745 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4747 __skb_pull(skb, hlen);
4750 * This works because the only protocols we care about don't require
4751 * special handling.
4752 * We'll fix it up properly in napi_frags_finish().
4754 skb->protocol = eth->h_proto;
4759 gro_result_t napi_gro_frags(struct napi_struct *napi)
4761 struct sk_buff *skb = napi_frags_skb(napi);
4766 trace_napi_gro_frags_entry(skb);
4768 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4770 EXPORT_SYMBOL(napi_gro_frags);
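/*
 * Editor's sketch: a page-based receive path. The driver borrows
 * napi->skb via napi_get_frags(), attaches the received page as a
 * fragment, and hands it back through napi_gro_frags().
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int off, unsigned int len,
			    unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;	/* caller recycles the page */

	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, off, len,
			truesize);
	napi_gro_frags(napi);
}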
4772 /* Compute the checksum from gro_offset and return the folded value
4773 * after adding in any pseudo checksum.
4775 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4780 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4782 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4783 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4785 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4786 !skb->csum_complete_sw)
4787 netdev_rx_csum_fault(skb->dev);
4790 NAPI_GRO_CB(skb)->csum = wsum;
4791 NAPI_GRO_CB(skb)->csum_valid = 1;
4795 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4798 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4799 * Note: called with local irq disabled, but exits with local irq enabled.
4801 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4804 struct softnet_data *remsd = sd->rps_ipi_list;
4807 sd->rps_ipi_list = NULL;
4811 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4813 struct softnet_data *next = remsd->rps_ipi_next;
4815 if (cpu_online(remsd->cpu))
4816 smp_call_function_single_async(remsd->cpu,
4825 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4828 return sd->rps_ipi_list != NULL;
4834 static int process_backlog(struct napi_struct *napi, int quota)
4836 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4840 /* Check if we have pending IPIs; it's better to send them now
4841 * than to wait for net_rx_action() to end.
4843 if (sd_has_rps_ipi_waiting(sd)) {
4844 local_irq_disable();
4845 net_rps_action_and_irq_enable(sd);
4848 napi->weight = weight_p;
4850 struct sk_buff *skb;
4852 while ((skb = __skb_dequeue(&sd->process_queue))) {
4854 __netif_receive_skb(skb);
4856 input_queue_head_incr(sd);
4857 if (++work >= quota)
4862 local_irq_disable();
4864 if (skb_queue_empty(&sd->input_pkt_queue)) {
4866 * Inline a custom version of __napi_complete().
4867 * Only the current CPU owns and manipulates this napi,
4868 * and NAPI_STATE_SCHED is the only possible flag set.
4870 * We can use a plain write instead of clear_bit(),
4871 * and we don't need an smp_mb() memory barrier.
4876 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4877 &sd->process_queue);
4887 * __napi_schedule - schedule for receive
4888 * @n: entry to schedule
4890 * The entry's receive function will be scheduled to run.
4891 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4893 void __napi_schedule(struct napi_struct *n)
4895 unsigned long flags;
4897 local_irq_save(flags);
4898 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4899 local_irq_restore(flags);
4901 EXPORT_SYMBOL(__napi_schedule);
4904 * __napi_schedule_irqoff - schedule for receive
4905 * @n: entry to schedule
4907 * Variant of __napi_schedule() assuming hard irqs are masked
4909 void __napi_schedule_irqoff(struct napi_struct *n)
4911 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4913 EXPORT_SYMBOL(__napi_schedule_irqoff);
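/*
 * Editor's sketch: a device interrupt handler masking its own IRQ
 * source and kicking NAPI. Hard irqs are already off here, so the
 * _irqoff schedule variant applies.
 */
static irqreturn_t example_isr(int irq, void *data)
{
	struct napi_struct *napi = data;

	/* ... ack and mask the device's RX/TX interrupts ... */
	napi_schedule_irqoff(napi);

	return IRQ_HANDLED;
}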
4915 void __napi_complete(struct napi_struct *n)
4917 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4919 list_del_init(&n->poll_list);
4920 smp_mb__before_atomic();
4921 clear_bit(NAPI_STATE_SCHED, &n->state);
4923 EXPORT_SYMBOL(__napi_complete);
4925 void napi_complete_done(struct napi_struct *n, int work_done)
4927 unsigned long flags;
4930 * don't let napi dequeue from the cpu poll list
4931 * just in case it's running on a different CPU
4933 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4937 unsigned long timeout = 0;
4940 timeout = n->dev->gro_flush_timeout;
4943 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4944 HRTIMER_MODE_REL_PINNED);
4946 napi_gro_flush(n, false);
4948 if (likely(list_empty(&n->poll_list))) {
4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4951 /* If n->poll_list is not empty, we need to mask irqs */
4952 local_irq_save(flags);
4954 local_irq_restore(flags);
4957 EXPORT_SYMBOL(napi_complete_done);
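/*
 * Example: the usual shape of a driver poll() method around
 * napi_complete_done(). A sketch only; the mydrv_* helpers are
 * hypothetical:
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = mydrv_clean_rx(napi, budget);
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			mydrv_enable_rx_irq(napi);	// re-arm device interrupts
 *		}
 *		return work;
 *	}
 *
 * Passing the real work_done count lets the core decide whether to arm
 * the gro_flush_timeout hrtimer above or flush GRO immediately.
 */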
4959 /* must be called under rcu_read_lock(), as we dont take a reference */
4960 static struct napi_struct *napi_by_id(unsigned int napi_id)
4962 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4963 struct napi_struct *napi;
4965 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4966 if (napi->napi_id == napi_id)
4972 #if defined(CONFIG_NET_RX_BUSY_POLL)
4973 #define BUSY_POLL_BUDGET 8
4974 bool sk_busy_loop(struct sock *sk, int nonblock)
4976 unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4977 int (*busy_poll)(struct napi_struct *dev);
4978 struct napi_struct *napi;
4983 napi = napi_by_id(sk->sk_napi_id);
4987 /* Note: ndo_busy_poll method is optional in linux-4.5 */
4988 busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4994 rc = busy_poll(napi);
4995 } else if (napi_schedule_prep(napi)) {
4996 void *have = netpoll_poll_lock(napi);
4998 if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4999 rc = napi->poll(napi, BUSY_POLL_BUDGET);
5000 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
5001 if (rc == BUSY_POLL_BUDGET) {
5002 napi_complete_done(napi, rc);
5003 napi_schedule(napi);
5006 netpoll_poll_unlock(have);
5009 __NET_ADD_STATS(sock_net(sk),
5010 LINUX_MIB_BUSYPOLLRXPACKETS, rc);
5013 if (rc == LL_FLUSH_FAILED)
5014 break; /* permanent failure */
5017 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
5018 !need_resched() && !busy_loop_timeout(end_time));
5020 rc = !skb_queue_empty(&sk->sk_receive_queue);
5025 EXPORT_SYMBOL(sk_busy_loop);
5027 #endif /* CONFIG_NET_RX_BUSY_POLL */
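/*
 * Example: busy polling is opted into per socket from userspace; a sketch
 * of the setsockopt() side that ultimately drives sk_busy_loop() above
 * (50 usec is just an illustrative budget):
 *
 *	int usecs = 50;
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL,
 *		       &usecs, sizeof(usecs)) < 0)
 *		perror("SO_BUSY_POLL");
 *
 * The sysctls net.core.busy_read and net.core.busy_poll set the
 * system-wide defaults.
 */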
5029 void napi_hash_add(struct napi_struct *napi)
5031 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5032 test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5035 spin_lock(&napi_hash_lock);
5037 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5039 if (unlikely(++napi_gen_id < NR_CPUS + 1))
5040 napi_gen_id = NR_CPUS + 1;
5041 } while (napi_by_id(napi_gen_id));
5042 napi->napi_id = napi_gen_id;
5044 hlist_add_head_rcu(&napi->napi_hash_node,
5045 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5047 spin_unlock(&napi_hash_lock);
5049 EXPORT_SYMBOL_GPL(napi_hash_add);
5051 /* Warning: the caller is responsible for making sure an RCU grace period
5052 * has elapsed before freeing the memory containing @napi
5054 bool napi_hash_del(struct napi_struct *napi)
5056 bool rcu_sync_needed = false;
5058 spin_lock(&napi_hash_lock);
5060 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5061 rcu_sync_needed = true;
5062 hlist_del_rcu(&napi->napi_hash_node);
5064 spin_unlock(&napi_hash_lock);
5065 return rcu_sync_needed;
5067 EXPORT_SYMBOL_GPL(napi_hash_del);
5069 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5071 struct napi_struct *napi;
5073 napi = container_of(timer, struct napi_struct, timer);
5075 napi_schedule(napi);
5077 return HRTIMER_NORESTART;
5080 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5081 int (*poll)(struct napi_struct *, int), int weight)
5083 INIT_LIST_HEAD(&napi->poll_list);
5084 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5085 napi->timer.function = napi_watchdog;
5086 napi->gro_count = 0;
5087 napi->gro_list = NULL;
5090 if (weight > NAPI_POLL_WEIGHT)
5091 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5093 napi->weight = weight;
5094 list_add(&napi->dev_list, &dev->napi_list);
5096 #ifdef CONFIG_NETPOLL
5097 spin_lock_init(&napi->poll_lock);
5098 napi->poll_owner = -1;
5100 set_bit(NAPI_STATE_SCHED, &napi->state);
5101 napi_hash_add(napi);
5103 EXPORT_SYMBOL(netif_napi_add);
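/*
 * Example: the typical driver lifecycle around netif_napi_add(), sketched
 * with hypothetical mydrv_* names:
 *
 *	// probe:
 *	netif_napi_add(dev, &priv->napi, mydrv_poll, NAPI_POLL_WEIGHT);
 *	// ndo_open:
 *	napi_enable(&priv->napi);
 *	// ndo_stop:
 *	napi_disable(&priv->napi);
 *	// remove:
 *	netif_napi_del(&priv->napi);
 */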
5105 void napi_disable(struct napi_struct *n)
5108 set_bit(NAPI_STATE_DISABLE, &n->state);
5110 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5112 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5115 hrtimer_cancel(&n->timer);
5117 clear_bit(NAPI_STATE_DISABLE, &n->state);
5119 EXPORT_SYMBOL(napi_disable);
5121 /* Must be called in process context */
5122 void netif_napi_del(struct napi_struct *napi)
5125 if (napi_hash_del(napi))
5127 list_del_init(&napi->dev_list);
5128 napi_free_frags(napi);
5130 kfree_skb_list(napi->gro_list);
5131 napi->gro_list = NULL;
5132 napi->gro_count = 0;
5134 EXPORT_SYMBOL(netif_napi_del);
5136 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5141 list_del_init(&n->poll_list);
5143 have = netpoll_poll_lock(n);
5147 /* This NAPI_STATE_SCHED test is for avoiding a race
5148 * with netpoll's poll_napi(). Only the entity which
5149 * obtains the lock and sees NAPI_STATE_SCHED set will
5150 * actually make the ->poll() call. Therefore we avoid
5151 * accidentally calling ->poll() when NAPI is not scheduled.
5154 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5155 work = n->poll(n, weight);
5156 trace_napi_poll(n, work, weight);
5159 WARN_ON_ONCE(work > weight);
5161 if (likely(work < weight))
5164 /* Drivers must not modify the NAPI state if they
5165 * consume the entire weight. In such cases this code
5166 * still "owns" the NAPI instance and therefore can
5167 * move the instance around on the list at-will.
5169 if (unlikely(napi_disable_pending(n))) {
5175 /* flush too old packets
5176 * If HZ < 1000, flush all packets.
5178 napi_gro_flush(n, HZ >= 1000);
5181 /* Some drivers may have called napi_schedule
5182 * prior to exhausting their budget.
5184 if (unlikely(!list_empty(&n->poll_list))) {
5185 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5186 n->dev ? n->dev->name : "backlog");
5190 list_add_tail(&n->poll_list, repoll);
5193 netpoll_poll_unlock(have);
5198 static __latent_entropy void net_rx_action(struct softirq_action *h)
5200 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5201 unsigned long time_limit = jiffies + 2;
5202 int budget = netdev_budget;
5206 local_irq_disable();
5207 list_splice_init(&sd->poll_list, &list);
5211 struct napi_struct *n;
5213 if (list_empty(&list)) {
5214 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5219 n = list_first_entry(&list, struct napi_struct, poll_list);
5220 budget -= napi_poll(n, &repoll);
5222 /* If softirq window is exhausted then punt.
5223 * Allow this to run for 2 jiffies, which allows
5224 * an average latency of 1.5/HZ.
5226 if (unlikely(budget <= 0 ||
5227 time_after_eq(jiffies, time_limit))) {
5233 __kfree_skb_flush();
5234 local_irq_disable();
5236 list_splice_tail_init(&sd->poll_list, &list);
5237 list_splice_tail(&repoll, &list);
5238 list_splice(&list, &sd->poll_list);
5239 if (!list_empty(&sd->poll_list))
5240 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5242 net_rps_action_and_irq_enable(sd);
5245 struct netdev_adjacent {
5246 struct net_device *dev;
5248 /* upper master flag; there can be only one master device per list */
5251 /* counter for the number of times this device was added to us */
5254 /* private field for the users */
5257 struct list_head list;
5258 struct rcu_head rcu;
5261 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5262 struct list_head *adj_list)
5264 struct netdev_adjacent *adj;
5266 list_for_each_entry(adj, adj_list, list) {
5267 if (adj->dev == adj_dev)
5274 * netdev_has_upper_dev - Check if device is linked to an upper device
5276 * @upper_dev: upper device to check
5278 * Find out if a device is linked to the specified upper device and return true
5279 * if it is. Note that this checks only the immediate upper device,
5280 * not the complete stack of devices. The caller must hold the RTNL lock.
5282 bool netdev_has_upper_dev(struct net_device *dev,
5283 struct net_device *upper_dev)
5287 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5289 EXPORT_SYMBOL(netdev_has_upper_dev);
5292 * netdev_has_any_upper_dev - Check if device is linked to some device
5295 * Find out if a device is linked to an upper device and return true if
5296 * it is. The caller must hold the RTNL lock.
5298 static bool netdev_has_any_upper_dev(struct net_device *dev)
5302 return !list_empty(&dev->all_adj_list.upper);
5306 * netdev_master_upper_dev_get - Get master upper device
5309 * Find a master upper device and return a pointer to it, or NULL if
5310 * there is none. The caller must hold the RTNL lock.
5312 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5314 struct netdev_adjacent *upper;
5318 if (list_empty(&dev->adj_list.upper))
5321 upper = list_first_entry(&dev->adj_list.upper,
5322 struct netdev_adjacent, list);
5323 if (likely(upper->master))
5327 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5329 void *netdev_adjacent_get_private(struct list_head *adj_list)
5331 struct netdev_adjacent *adj;
5333 adj = list_entry(adj_list, struct netdev_adjacent, list);
5335 return adj->private;
5337 EXPORT_SYMBOL(netdev_adjacent_get_private);
5340 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5342 * @iter: list_head ** of the current position
5344 * Gets the next device from the dev's upper list, starting from iter
5345 * position. The caller must hold RCU read lock.
5347 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5348 struct list_head **iter)
5350 struct netdev_adjacent *upper;
5352 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5354 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5356 if (&upper->list == &dev->adj_list.upper)
5359 *iter = &upper->list;
5363 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
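/*
 * Example: this iterator is normally used through the
 * netdev_for_each_upper_dev_rcu() helper macro; a sketch:
 *
 *	struct net_device *upper;
 *	struct list_head *iter;
 *
 *	rcu_read_lock();
 *	netdev_for_each_upper_dev_rcu(dev, upper, iter)
 *		pr_debug("upper dev: %s\n", upper->name);
 *	rcu_read_unlock();
 */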
5366 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5368 * @iter: list_head ** of the current position
5370 * Gets the next device from the dev's upper list, starting from iter
5371 * position. The caller must hold RCU read lock.
5373 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5374 struct list_head **iter)
5376 struct netdev_adjacent *upper;
5378 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5380 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5382 if (&upper->list == &dev->all_adj_list.upper)
5385 *iter = &upper->list;
5389 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5392 * netdev_lower_get_next_private - Get the next ->private from the
5393 * lower neighbour list
5395 * @iter: list_head ** of the current position
5397 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5398 * list, starting from iter position. The caller must either hold the
5399 * RTNL lock or its own locking that guarantees that the neighbour lower
5400 * list will remain unchanged.
5402 void *netdev_lower_get_next_private(struct net_device *dev,
5403 struct list_head **iter)
5405 struct netdev_adjacent *lower;
5407 lower = list_entry(*iter, struct netdev_adjacent, list);
5409 if (&lower->list == &dev->adj_list.lower)
5412 *iter = lower->list.next;
5414 return lower->private;
5416 EXPORT_SYMBOL(netdev_lower_get_next_private);
5419 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5420 * lower neighbour list, RCU
5423 * @iter: list_head ** of the current position
5425 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5426 * list, starting from iter position. The caller must hold RCU read lock.
5428 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5429 struct list_head **iter)
5431 struct netdev_adjacent *lower;
5433 WARN_ON_ONCE(!rcu_read_lock_held());
5435 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5437 if (&lower->list == &dev->adj_list.lower)
5440 *iter = &lower->list;
5442 return lower->private;
5444 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5447 * netdev_lower_get_next - Get the next device from the lower neighbour
5450 * @iter: list_head ** of the current position
5452 * Gets the next netdev_adjacent from the dev's lower neighbour
5453 * list, starting from iter position. The caller must hold RTNL lock or
5454 * its own locking that guarantees that the neighbour lower
5455 * list will remain unchanged.
5457 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5459 struct netdev_adjacent *lower;
5461 lower = list_entry(*iter, struct netdev_adjacent, list);
5463 if (&lower->list == &dev->adj_list.lower)
5466 *iter = lower->list.next;
5470 EXPORT_SYMBOL(netdev_lower_get_next);
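/*
 * Example: walking the immediate lower devices is usually done with the
 * netdev_for_each_lower_dev() macro built on this helper (the same
 * pattern dev_get_nest_level() in this file uses):
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	ASSERT_RTNL();
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		pr_debug("lower dev: %s\n", lower->name);
 */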
5473 * netdev_all_lower_get_next - Get the next device from all lower neighbour list
5475 * @iter: list_head ** of the current position
5477 * Gets the next netdev_adjacent from the dev's all lower neighbour
5478 * list, starting from iter position. The caller must hold RTNL lock or
5479 * its own locking that guarantees that the neighbour all lower
5480 * list will remain unchanged.
5482 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter)
5484 struct netdev_adjacent *lower;
5486 lower = list_entry(*iter, struct netdev_adjacent, list);
5488 if (&lower->list == &dev->all_adj_list.lower)
5491 *iter = lower->list.next;
5495 EXPORT_SYMBOL(netdev_all_lower_get_next);
5498 * netdev_all_lower_get_next_rcu - Get the next device from all
5499 * lower neighbour list, RCU variant
5501 * @iter: list_head ** of the current position
5503 * Gets the next netdev_adjacent from the dev's all lower neighbour
5504 * list, starting from iter position. The caller must hold RCU read lock.
5506 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev,
5507 struct list_head **iter)
5509 struct netdev_adjacent *lower;
5511 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5513 if (&lower->list == &dev->all_adj_list.lower)
5516 *iter = &lower->list;
5520 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu);
5523 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5524 * lower neighbour list, RCU
5528 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5529 * list. The caller must hold RCU read lock.
5531 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5533 struct netdev_adjacent *lower;
5535 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5536 struct netdev_adjacent, list);
5538 return lower->private;
5541 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5544 * netdev_master_upper_dev_get_rcu - Get master upper device
5547 * Find a master upper device and return a pointer to it, or NULL if
5548 * there is none. The caller must hold the RCU read lock.
5550 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5552 struct netdev_adjacent *upper;
5554 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5555 struct netdev_adjacent, list);
5556 if (upper && likely(upper->master))
5560 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5562 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5563 struct net_device *adj_dev,
5564 struct list_head *dev_list)
5566 char linkname[IFNAMSIZ+7];
5567 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5568 "upper_%s" : "lower_%s", adj_dev->name);
5569 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5572 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5574 struct list_head *dev_list)
5576 char linkname[IFNAMSIZ+7];
5577 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5578 "upper_%s" : "lower_%s", name);
5579 sysfs_remove_link(&(dev->dev.kobj), linkname);
5582 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5583 struct net_device *adj_dev,
5584 struct list_head *dev_list)
5586 return (dev_list == &dev->adj_list.upper ||
5587 dev_list == &dev->adj_list.lower) &&
5588 net_eq(dev_net(dev), dev_net(adj_dev));
5591 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5592 struct net_device *adj_dev,
5594 struct list_head *dev_list,
5595 void *private, bool master)
5597 struct netdev_adjacent *adj;
5600 adj = __netdev_find_adj(adj_dev, dev_list);
5603 adj->ref_nr += ref_nr;
5607 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5612 adj->master = master;
5613 adj->ref_nr = ref_nr;
5614 adj->private = private;
5617 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5618 adj_dev->name, dev->name, adj_dev->name);
5620 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5621 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5626 /* Ensure that master link is always the first item in list. */
5628 ret = sysfs_create_link(&(dev->dev.kobj),
5629 &(adj_dev->dev.kobj), "master");
5631 goto remove_symlinks;
5633 list_add_rcu(&adj->list, dev_list);
5635 list_add_tail_rcu(&adj->list, dev_list);
5641 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5642 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5650 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5651 struct net_device *adj_dev,
5653 struct list_head *dev_list)
5655 struct netdev_adjacent *adj;
5657 adj = __netdev_find_adj(adj_dev, dev_list);
5660 pr_err("tried to remove device %s from %s\n",
5661 dev->name, adj_dev->name);
5665 if (adj->ref_nr > ref_nr) {
5666 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5667 ref_nr, adj->ref_nr-ref_nr);
5668 adj->ref_nr -= ref_nr;
5673 sysfs_remove_link(&(dev->dev.kobj), "master");
5675 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5676 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5678 list_del_rcu(&adj->list);
5679 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5680 adj_dev->name, dev->name, adj_dev->name);
5682 kfree_rcu(adj, rcu);
5685 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5686 struct net_device *upper_dev,
5688 struct list_head *up_list,
5689 struct list_head *down_list,
5690 void *private, bool master)
5694 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5699 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5702 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5709 static int __netdev_adjacent_dev_link(struct net_device *dev,
5710 struct net_device *upper_dev,
5713 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5714 &dev->all_adj_list.upper,
5715 &upper_dev->all_adj_list.lower,
5719 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5720 struct net_device *upper_dev,
5722 struct list_head *up_list,
5723 struct list_head *down_list)
5725 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5726 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5729 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5730 struct net_device *upper_dev,
5733 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5734 &dev->all_adj_list.upper,
5735 &upper_dev->all_adj_list.lower);
5738 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5739 struct net_device *upper_dev,
5740 void *private, bool master)
5742 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5747 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5748 &dev->adj_list.upper,
5749 &upper_dev->adj_list.lower,
5752 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5759 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5760 struct net_device *upper_dev)
5762 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5763 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5764 &dev->adj_list.upper,
5765 &upper_dev->adj_list.lower);
5768 static int __netdev_upper_dev_link(struct net_device *dev,
5769 struct net_device *upper_dev, bool master,
5770 void *upper_priv, void *upper_info)
5772 struct netdev_notifier_changeupper_info changeupper_info;
5773 struct netdev_adjacent *i, *j, *to_i, *to_j;
5778 if (dev == upper_dev)
5781 /* To prevent loops, check that dev is not an upper device of upper_dev. */
5782 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5785 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5788 if (master && netdev_master_upper_dev_get(dev))
5791 changeupper_info.upper_dev = upper_dev;
5792 changeupper_info.master = master;
5793 changeupper_info.linking = true;
5794 changeupper_info.upper_info = upper_info;
5796 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5797 &changeupper_info.info);
5798 ret = notifier_to_errno(ret);
5802 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5807 /* Now that we linked these devs, make all the upper_dev's
5808 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5809 * vice versa, and don't forget the devices themselves. All of these
5810 * links are non-neighbours.
5812 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5813 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5814 pr_debug("Interlinking %s with %s, non-neighbour\n",
5815 i->dev->name, j->dev->name);
5816 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5822 /* add dev to every upper_dev's upper device */
5823 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5824 pr_debug("linking %s's upper device %s with %s\n",
5825 upper_dev->name, i->dev->name, dev->name);
5826 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5828 goto rollback_upper_mesh;
5831 /* add upper_dev to every dev's lower device */
5832 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5833 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5834 i->dev->name, upper_dev->name);
5835 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5837 goto rollback_lower_mesh;
5840 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5841 &changeupper_info.info);
5842 ret = notifier_to_errno(ret);
5844 goto rollback_lower_mesh;
5848 rollback_lower_mesh:
5850 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5853 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5858 rollback_upper_mesh:
5860 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5863 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5871 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5872 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5873 if (i == to_i && j == to_j)
5875 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5881 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5887 * netdev_upper_dev_link - Add a link to the upper device
5889 * @upper_dev: new upper device
5891 * Adds a link to a device which is upper to this one. The caller must hold
5892 * the RTNL lock. On failure a negative errno code is returned.
5893 * On success the reference counts are adjusted and the function returns zero.
5896 int netdev_upper_dev_link(struct net_device *dev,
5897 struct net_device *upper_dev)
5899 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5901 EXPORT_SYMBOL(netdev_upper_dev_link);
5904 * netdev_master_upper_dev_link - Add a master link to the upper device
5906 * @upper_dev: new upper device
5907 * @upper_priv: upper device private
5908 * @upper_info: upper info to be passed down via notifier
5910 * Adds a link to a device which is upper to this one. In this case, only
5911 * one master upper device can be linked, although other non-master devices
5912 * might be linked as well. The caller must hold the RTNL lock.
5913 * On a failure a negative errno code is returned. On success the reference
5914 * counts are adjusted and the function returns zero.
5916 int netdev_master_upper_dev_link(struct net_device *dev,
5917 struct net_device *upper_dev,
5918 void *upper_priv, void *upper_info)
5920 return __netdev_upper_dev_link(dev, upper_dev, true,
5921 upper_priv, upper_info);
5923 EXPORT_SYMBOL(netdev_master_upper_dev_link);
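/*
 * Example: how a bonding/bridge-style master typically wires a slave in,
 * sketched with hypothetical device names and error handling trimmed:
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev,
 *					   NULL, NULL);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);	// teardown path
 */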
5926 * netdev_upper_dev_unlink - Removes a link to upper device
5928 * @upper_dev: upper device to unlink
5930 * Removes a link to a device which is upper to this one. The caller must hold the RTNL lock.
5933 void netdev_upper_dev_unlink(struct net_device *dev,
5934 struct net_device *upper_dev)
5936 struct netdev_notifier_changeupper_info changeupper_info;
5937 struct netdev_adjacent *i, *j;
5940 changeupper_info.upper_dev = upper_dev;
5941 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5942 changeupper_info.linking = false;
5944 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5945 &changeupper_info.info);
5947 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5949 /* Here is the tricky part. We must remove all dev's lower
5950 * devices from all upper_dev's upper devices and vice
5951 * versa, to maintain the graph relationship.
5953 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5954 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5955 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5957 /* also remove the devices themselves from the lower/upper device
5960 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5961 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5963 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5964 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5966 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5967 &changeupper_info.info);
5969 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5972 * netdev_bonding_info_change - Dispatch event about slave change
5974 * @bonding_info: info to dispatch
5976 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5977 * The caller must hold the RTNL lock.
5979 void netdev_bonding_info_change(struct net_device *dev,
5980 struct netdev_bonding_info *bonding_info)
5982 struct netdev_notifier_bonding_info info;
5984 memcpy(&info.bonding_info, bonding_info,
5985 sizeof(struct netdev_bonding_info));
5986 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5989 EXPORT_SYMBOL(netdev_bonding_info_change);
5991 static void netdev_adjacent_add_links(struct net_device *dev)
5993 struct netdev_adjacent *iter;
5995 struct net *net = dev_net(dev);
5997 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5998 if (!net_eq(net, dev_net(iter->dev)))
6000 netdev_adjacent_sysfs_add(iter->dev, dev,
6001 &iter->dev->adj_list.lower);
6002 netdev_adjacent_sysfs_add(dev, iter->dev,
6003 &dev->adj_list.upper);
6006 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6007 if (!net_eq(net, dev_net(iter->dev)))
6009 netdev_adjacent_sysfs_add(iter->dev, dev,
6010 &iter->dev->adj_list.upper);
6011 netdev_adjacent_sysfs_add(dev, iter->dev,
6012 &dev->adj_list.lower);
6016 static void netdev_adjacent_del_links(struct net_device *dev)
6018 struct netdev_adjacent *iter;
6020 struct net *net = dev_net(dev);
6022 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6023 if (!net_eq(net, dev_net(iter->dev)))
6025 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6026 &iter->dev->adj_list.lower);
6027 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6028 &dev->adj_list.upper);
6031 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6032 if (!net_eq(net, dev_net(iter->dev)))
6034 netdev_adjacent_sysfs_del(iter->dev, dev->name,
6035 &iter->dev->adj_list.upper);
6036 netdev_adjacent_sysfs_del(dev, iter->dev->name,
6037 &dev->adj_list.lower);
6041 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
6043 struct netdev_adjacent *iter;
6045 struct net *net = dev_net(dev);
6047 list_for_each_entry(iter, &dev->adj_list.upper, list) {
6048 if (!net_eq(net, dev_net(iter->dev)))
6050 netdev_adjacent_sysfs_del(iter->dev, oldname,
6051 &iter->dev->adj_list.lower);
6052 netdev_adjacent_sysfs_add(iter->dev, dev,
6053 &iter->dev->adj_list.lower);
6056 list_for_each_entry(iter, &dev->adj_list.lower, list) {
6057 if (!net_eq(net, dev_net(iter->dev)))
6059 netdev_adjacent_sysfs_del(iter->dev, oldname,
6060 &iter->dev->adj_list.upper);
6061 netdev_adjacent_sysfs_add(iter->dev, dev,
6062 &iter->dev->adj_list.upper);
6066 void *netdev_lower_dev_get_private(struct net_device *dev,
6067 struct net_device *lower_dev)
6069 struct netdev_adjacent *lower;
6073 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
6077 return lower->private;
6079 EXPORT_SYMBOL(netdev_lower_dev_get_private);
6082 int dev_get_nest_level(struct net_device *dev)
6084 struct net_device *lower = NULL;
6085 struct list_head *iter;
6091 netdev_for_each_lower_dev(dev, lower, iter) {
6092 nest = dev_get_nest_level(lower);
6093 if (max_nest < nest)
6097 return max_nest + 1;
6099 EXPORT_SYMBOL(dev_get_nest_level);
6102 * netdev_lower_state_changed - Dispatch event about lower device state change
6103 * @lower_dev: device
6104 * @lower_state_info: state to dispatch
6106 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6107 * The caller must hold the RTNL lock.
6109 void netdev_lower_state_changed(struct net_device *lower_dev,
6110 void *lower_state_info)
6112 struct netdev_notifier_changelowerstate_info changelowerstate_info;
6115 changelowerstate_info.lower_state_info = lower_state_info;
6116 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6117 &changelowerstate_info.info);
6119 EXPORT_SYMBOL(netdev_lower_state_changed);
6121 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
6122 struct neighbour *n)
6124 struct net_device *lower_dev, *stop_dev;
6125 struct list_head *iter;
6128 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6129 if (!lower_dev->netdev_ops->ndo_neigh_construct)
6131 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
6133 stop_dev = lower_dev;
6140 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6141 if (lower_dev == stop_dev)
6143 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6145 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6149 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6151 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6152 struct neighbour *n)
6154 struct net_device *lower_dev;
6155 struct list_head *iter;
6157 netdev_for_each_lower_dev(dev, lower_dev, iter) {
6158 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6160 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6163 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6165 static void dev_change_rx_flags(struct net_device *dev, int flags)
6167 const struct net_device_ops *ops = dev->netdev_ops;
6169 if (ops->ndo_change_rx_flags)
6170 ops->ndo_change_rx_flags(dev, flags);
6173 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6175 unsigned int old_flags = dev->flags;
6181 dev->flags |= IFF_PROMISC;
6182 dev->promiscuity += inc;
6183 if (dev->promiscuity == 0) {
6186 * If inc causes overflow, untouch promisc and return error.
6189 dev->flags &= ~IFF_PROMISC;
6191 dev->promiscuity -= inc;
6192 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6197 if (dev->flags != old_flags) {
6198 pr_info("device %s %s promiscuous mode\n",
6200 dev->flags & IFF_PROMISC ? "entered" : "left");
6201 if (audit_enabled) {
6202 current_uid_gid(&uid, &gid);
6203 audit_log(current->audit_context, GFP_ATOMIC,
6204 AUDIT_ANOM_PROMISCUOUS,
6205 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6206 dev->name, (dev->flags & IFF_PROMISC),
6207 (old_flags & IFF_PROMISC),
6208 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6209 from_kuid(&init_user_ns, uid),
6210 from_kgid(&init_user_ns, gid),
6211 audit_get_sessionid(current));
6214 dev_change_rx_flags(dev, IFF_PROMISC);
6217 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6222 * dev_set_promiscuity - update promiscuity count on a device
6226 * Add or remove promiscuity from a device. While the count in the device
6227 * remains above zero the interface remains promiscuous. Once it hits zero
6228 * the device reverts back to normal filtering operation. A negative inc
6229 * value is used to drop promiscuity on the device.
6230 * Return 0 if successful or a negative errno code on error.
6232 int dev_set_promiscuity(struct net_device *dev, int inc)
6234 unsigned int old_flags = dev->flags;
6237 err = __dev_set_promiscuity(dev, inc, true);
6240 if (dev->flags != old_flags)
6241 dev_set_rx_mode(dev);
6244 EXPORT_SYMBOL(dev_set_promiscuity);
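/*
 * Example: the counter semantics mean every +1 must eventually be paired
 * with a -1. A packet-capture style user would do, under RTNL:
 *
 *	dev_set_promiscuity(dev, 1);	// start sniffing
 *	...
 *	dev_set_promiscuity(dev, -1);	// done; device may leave promisc mode
 */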
6246 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6248 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6252 dev->flags |= IFF_ALLMULTI;
6253 dev->allmulti += inc;
6254 if (dev->allmulti == 0) {
6257 * If inc causes overflow, untouch allmulti and return error.
6260 dev->flags &= ~IFF_ALLMULTI;
6262 dev->allmulti -= inc;
6263 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6268 if (dev->flags ^ old_flags) {
6269 dev_change_rx_flags(dev, IFF_ALLMULTI);
6270 dev_set_rx_mode(dev);
6272 __dev_notify_flags(dev, old_flags,
6273 dev->gflags ^ old_gflags);
6279 * dev_set_allmulti - update allmulti count on a device
6283 * Add or remove reception of all multicast frames to a device. While the
6284 * count in the device remains above zero the interface remains listening
6285 * to all multicast frames. Once it hits zero the device reverts back to normal
6286 * filtering operation. A negative @inc value is used to drop the counter
6287 * when releasing a resource needing all multicasts.
6288 * Return 0 if successful or a negative errno code on error.
6291 int dev_set_allmulti(struct net_device *dev, int inc)
6293 return __dev_set_allmulti(dev, inc, true);
6295 EXPORT_SYMBOL(dev_set_allmulti);
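/*
 * Example: the same counter discipline as promiscuity above; e.g. a
 * routing daemon's kernel helper that needs all multicast frames might do:
 *
 *	dev_set_allmulti(dev, 1);	// acquire
 *	...
 *	dev_set_allmulti(dev, -1);	// release
 */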
6298 * Upload unicast and multicast address lists to device and
6299 * configure RX filtering. When the device doesn't support unicast
6300 * filtering it is put in promiscuous mode while unicast addresses are present.
6303 void __dev_set_rx_mode(struct net_device *dev)
6305 const struct net_device_ops *ops = dev->netdev_ops;
6307 /* dev_open will call this function so the list will stay sane. */
6308 if (!(dev->flags&IFF_UP))
6311 if (!netif_device_present(dev))
6314 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6315 /* Unicast address changes may only happen under the rtnl,
6316 * therefore calling __dev_set_promiscuity here is safe.
6318 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6319 __dev_set_promiscuity(dev, 1, false);
6320 dev->uc_promisc = true;
6321 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6322 __dev_set_promiscuity(dev, -1, false);
6323 dev->uc_promisc = false;
6327 if (ops->ndo_set_rx_mode)
6328 ops->ndo_set_rx_mode(dev);
6331 void dev_set_rx_mode(struct net_device *dev)
6333 netif_addr_lock_bh(dev);
6334 __dev_set_rx_mode(dev);
6335 netif_addr_unlock_bh(dev);
6339 * dev_get_flags - get flags reported to userspace
6342 * Get the combination of flag bits exported through APIs to userspace.
6344 unsigned int dev_get_flags(const struct net_device *dev)
6348 flags = (dev->flags & ~(IFF_PROMISC |
6353 (dev->gflags & (IFF_PROMISC |
6356 if (netif_running(dev)) {
6357 if (netif_oper_up(dev))
6358 flags |= IFF_RUNNING;
6359 if (netif_carrier_ok(dev))
6360 flags |= IFF_LOWER_UP;
6361 if (netif_dormant(dev))
6362 flags |= IFF_DORMANT;
6367 EXPORT_SYMBOL(dev_get_flags);
6369 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6371 unsigned int old_flags = dev->flags;
6377 * Set the flags on our device.
6380 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6381 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6383 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6387 * Load in the correct multicast list now that the flags have changed.
6390 if ((old_flags ^ flags) & IFF_MULTICAST)
6391 dev_change_rx_flags(dev, IFF_MULTICAST);
6393 dev_set_rx_mode(dev);
6396 * Have we downed the interface? We handle IFF_UP ourselves
6397 * according to user attempts to set it, rather than blindly
6402 if ((old_flags ^ flags) & IFF_UP)
6403 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6405 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6406 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6407 unsigned int old_flags = dev->flags;
6409 dev->gflags ^= IFF_PROMISC;
6411 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6412 if (dev->flags != old_flags)
6413 dev_set_rx_mode(dev);
6416 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6417 is important. Some (broken) drivers set IFF_PROMISC when
6418 IFF_ALLMULTI is requested, without asking us and without reporting it.
6420 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6421 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6423 dev->gflags ^= IFF_ALLMULTI;
6424 __dev_set_allmulti(dev, inc, false);
6430 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6431 unsigned int gchanges)
6433 unsigned int changes = dev->flags ^ old_flags;
6436 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6438 if (changes & IFF_UP) {
6439 if (dev->flags & IFF_UP)
6440 call_netdevice_notifiers(NETDEV_UP, dev);
6442 call_netdevice_notifiers(NETDEV_DOWN, dev);
6445 if (dev->flags & IFF_UP &&
6446 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6447 struct netdev_notifier_change_info change_info;
6449 change_info.flags_changed = changes;
6450 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6456 * dev_change_flags - change device settings
6458 * @flags: device state flags
6460 * Change settings on a device based on state flags. The flags are
6461 * in the userspace exported format.
6463 int dev_change_flags(struct net_device *dev, unsigned int flags)
6466 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6468 ret = __dev_change_flags(dev, flags);
6472 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6473 __dev_notify_flags(dev, old_flags, changes);
6476 EXPORT_SYMBOL(dev_change_flags);
6478 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6480 const struct net_device_ops *ops = dev->netdev_ops;
6482 if (ops->ndo_change_mtu)
6483 return ops->ndo_change_mtu(dev, new_mtu);
6490 * dev_set_mtu - Change maximum transfer unit
6492 * @new_mtu: new transfer unit
6494 * Change the maximum transfer size of the network device.
6496 int dev_set_mtu(struct net_device *dev, int new_mtu)
6500 if (new_mtu == dev->mtu)
6503 /* MTU must be positive. */
6507 if (!netif_device_present(dev))
6510 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6511 err = notifier_to_errno(err);
6515 orig_mtu = dev->mtu;
6516 err = __dev_set_mtu(dev, new_mtu);
6519 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6520 err = notifier_to_errno(err);
6522 /* setting mtu back and notifying everyone again,
6523 * so that they have a chance to revert changes.
6525 __dev_set_mtu(dev, orig_mtu);
6526 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6531 EXPORT_SYMBOL(dev_set_mtu);
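/*
 * Example: a minimal in-kernel caller, assuming rtnl is held (the ioctl
 * and rtnetlink paths both arrive here that way); 9000 is just an
 * illustrative jumbo-frame value:
 *
 *	int err = dev_set_mtu(dev, 9000);
 *
 *	if (err)
 *		netdev_err(dev, "failed to set MTU: %d\n", err);
 */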
6534 * dev_set_group - Change group this device belongs to
6536 * @new_group: group this device should belong to
6538 void dev_set_group(struct net_device *dev, int new_group)
6540 dev->group = new_group;
6542 EXPORT_SYMBOL(dev_set_group);
6545 * dev_set_mac_address - Change Media Access Control Address
6549 * Change the hardware (MAC) address of the device
6551 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6553 const struct net_device_ops *ops = dev->netdev_ops;
6556 if (!ops->ndo_set_mac_address)
6558 if (sa->sa_family != dev->type)
6560 if (!netif_device_present(dev))
6562 err = ops->ndo_set_mac_address(dev, sa);
6565 dev->addr_assign_type = NET_ADDR_SET;
6566 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6567 add_device_randomness(dev->dev_addr, dev->addr_len);
6570 EXPORT_SYMBOL(dev_set_mac_address);
6573 * dev_change_carrier - Change device carrier
6575 * @new_carrier: new value
6577 * Change device carrier
6579 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6581 const struct net_device_ops *ops = dev->netdev_ops;
6583 if (!ops->ndo_change_carrier)
6585 if (!netif_device_present(dev))
6587 return ops->ndo_change_carrier(dev, new_carrier);
6589 EXPORT_SYMBOL(dev_change_carrier);
6592 * dev_get_phys_port_id - Get device physical port ID
6596 * Get device physical port ID
6598 int dev_get_phys_port_id(struct net_device *dev,
6599 struct netdev_phys_item_id *ppid)
6601 const struct net_device_ops *ops = dev->netdev_ops;
6603 if (!ops->ndo_get_phys_port_id)
6605 return ops->ndo_get_phys_port_id(dev, ppid);
6607 EXPORT_SYMBOL(dev_get_phys_port_id);
6610 * dev_get_phys_port_name - Get device physical port name
6613 * @len: limit of bytes to copy to name
6615 * Get device physical port name
6617 int dev_get_phys_port_name(struct net_device *dev,
6618 char *name, size_t len)
6620 const struct net_device_ops *ops = dev->netdev_ops;
6622 if (!ops->ndo_get_phys_port_name)
6624 return ops->ndo_get_phys_port_name(dev, name, len);
6626 EXPORT_SYMBOL(dev_get_phys_port_name);
6629 * dev_change_proto_down - update protocol port state information
6631 * @proto_down: new value
6633 * This info can be used by switch drivers to set the phys state of the switch port.
6636 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6638 const struct net_device_ops *ops = dev->netdev_ops;
6640 if (!ops->ndo_change_proto_down)
6642 if (!netif_device_present(dev))
6644 return ops->ndo_change_proto_down(dev, proto_down);
6646 EXPORT_SYMBOL(dev_change_proto_down);
6649 * dev_change_xdp_fd - set or clear a bpf program for a device rx path
6651 * @fd: new program fd or negative value to clear
6653 * Set or clear a bpf program for a device
6655 int dev_change_xdp_fd(struct net_device *dev, int fd)
6657 const struct net_device_ops *ops = dev->netdev_ops;
6658 struct bpf_prog *prog = NULL;
6659 struct netdev_xdp xdp = {};
6665 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6667 return PTR_ERR(prog);
6670 xdp.command = XDP_SETUP_PROG;
6672 err = ops->ndo_xdp(dev, &xdp);
6673 if (err < 0 && prog)
6678 EXPORT_SYMBOL(dev_change_xdp_fd);
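/*
 * Example: the rtnetlink IFLA_XDP handler is the expected caller; a
 * sketch of the calling convention, with prog_fd assumed to come from
 * userspace (e.g. via bpf(BPF_PROG_LOAD, ...)):
 *
 *	ASSERT_RTNL();
 *	err = dev_change_xdp_fd(dev, prog_fd);	// attach
 *	...
 *	err = dev_change_xdp_fd(dev, -1);	// negative fd clears the program
 */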
6681 * dev_new_index - allocate an ifindex
6682 * @net: the applicable net namespace
6684 * Returns a suitable unique value for a new device interface
6685 * number. The caller must hold the rtnl semaphore or the
6686 * dev_base_lock to be sure it remains unique.
6688 static int dev_new_index(struct net *net)
6690 int ifindex = net->ifindex;
6694 if (!__dev_get_by_index(net, ifindex))
6695 return net->ifindex = ifindex;
6699 /* Delayed registration/unregistration */
6700 static LIST_HEAD(net_todo_list);
6701 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6703 static void net_set_todo(struct net_device *dev)
6705 list_add_tail(&dev->todo_list, &net_todo_list);
6706 dev_net(dev)->dev_unreg_count++;
6709 static void rollback_registered_many(struct list_head *head)
6711 struct net_device *dev, *tmp;
6712 LIST_HEAD(close_head);
6714 BUG_ON(dev_boot_phase);
6717 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6718 /* Some devices call without registering
6719 * for initialization unwind. Remove those
6720 * devices and proceed with the remaining.
6722 if (dev->reg_state == NETREG_UNINITIALIZED) {
6723 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6727 list_del(&dev->unreg_list);
6730 dev->dismantle = true;
6731 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6734 /* If device is running, close it first. */
6735 list_for_each_entry(dev, head, unreg_list)
6736 list_add_tail(&dev->close_list, &close_head);
6737 dev_close_many(&close_head, true);
6739 list_for_each_entry(dev, head, unreg_list) {
6740 /* And unlink it from device chain. */
6741 unlist_netdevice(dev);
6743 dev->reg_state = NETREG_UNREGISTERING;
6745 flush_all_backlogs();
6749 list_for_each_entry(dev, head, unreg_list) {
6750 struct sk_buff *skb = NULL;
6752 /* Shutdown queueing discipline. */
6756 /* Notify protocols that we are about to destroy
6757 this device. They should clean up all their state.
6759 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6761 if (!dev->rtnl_link_ops ||
6762 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6763 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6767 * Flush the unicast and multicast chains
6772 if (dev->netdev_ops->ndo_uninit)
6773 dev->netdev_ops->ndo_uninit(dev);
6776 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6778 /* The notifier chain MUST detach all our upper devices. */
6779 WARN_ON(netdev_has_any_upper_dev(dev));
6781 /* Remove entries from kobject tree */
6782 netdev_unregister_kobject(dev);
6784 /* Remove XPS queueing entries */
6785 netif_reset_xps_queues_gt(dev, 0);
6791 list_for_each_entry(dev, head, unreg_list)
6795 static void rollback_registered(struct net_device *dev)
6799 list_add(&dev->unreg_list, &single);
6800 rollback_registered_many(&single);
6804 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6805 struct net_device *upper, netdev_features_t features)
6807 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6808 netdev_features_t feature;
6811 for_each_netdev_feature(&upper_disables, feature_bit) {
6812 feature = __NETIF_F_BIT(feature_bit);
6813 if (!(upper->wanted_features & feature)
6814 && (features & feature)) {
6815 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6816 &feature, upper->name);
6817 features &= ~feature;
6824 static void netdev_sync_lower_features(struct net_device *upper,
6825 struct net_device *lower, netdev_features_t features)
6827 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6828 netdev_features_t feature;
6831 for_each_netdev_feature(&upper_disables, feature_bit) {
6832 feature = __NETIF_F_BIT(feature_bit);
6833 if (!(features & feature) && (lower->features & feature)) {
6834 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6835 &feature, lower->name);
6836 lower->wanted_features &= ~feature;
6837 netdev_update_features(lower);
6839 if (unlikely(lower->features & feature))
6840 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6841 &feature, lower->name);
6846 static netdev_features_t netdev_fix_features(struct net_device *dev,
6847 netdev_features_t features)
6849 /* Fix illegal checksum combinations */
6850 if ((features & NETIF_F_HW_CSUM) &&
6851 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6852 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6853 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6856 /* TSO requires that SG is present as well. */
6857 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6858 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6859 features &= ~NETIF_F_ALL_TSO;
6862 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6863 !(features & NETIF_F_IP_CSUM)) {
6864 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6865 features &= ~NETIF_F_TSO;
6866 features &= ~NETIF_F_TSO_ECN;
6869 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6870 !(features & NETIF_F_IPV6_CSUM)) {
6871 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6872 features &= ~NETIF_F_TSO6;
6875 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6876 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6877 features &= ~NETIF_F_TSO_MANGLEID;
6879 /* TSO ECN requires that TSO is present as well. */
6880 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6881 features &= ~NETIF_F_TSO_ECN;
6883 /* Software GSO depends on SG. */
6884 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6885 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6886 features &= ~NETIF_F_GSO;
6889 /* UFO needs SG and checksumming */
6890 if (features & NETIF_F_UFO) {
6891 /* maybe split UFO into V4 and V6? */
6892 if (!(features & NETIF_F_HW_CSUM) &&
6893 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6894 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6896 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6897 features &= ~NETIF_F_UFO;
6900 if (!(features & NETIF_F_SG)) {
6902 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6903 features &= ~NETIF_F_UFO;
6907 /* GSO partial features require GSO partial be set */
6908 if ((features & dev->gso_partial_features) &&
6909 !(features & NETIF_F_GSO_PARTIAL)) {
6911 "Dropping partially supported GSO features since no GSO partial.\n");
6912 features &= ~dev->gso_partial_features;
6915 #ifdef CONFIG_NET_RX_BUSY_POLL
6916 if (dev->netdev_ops->ndo_busy_poll)
6917 features |= NETIF_F_BUSY_POLL;
6920 features &= ~NETIF_F_BUSY_POLL;
6925 int __netdev_update_features(struct net_device *dev)
6927 struct net_device *upper, *lower;
6928 netdev_features_t features;
6929 struct list_head *iter;
6934 features = netdev_get_wanted_features(dev);
6936 if (dev->netdev_ops->ndo_fix_features)
6937 features = dev->netdev_ops->ndo_fix_features(dev, features);
6939 /* driver might be less strict about feature dependencies */
6940 features = netdev_fix_features(dev, features);
6942 /* some features can't be enabled if they're off on an upper device */
6943 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6944 features = netdev_sync_upper_features(dev, upper, features);
6946 if (dev->features == features)
6949 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6950 &dev->features, &features);
6952 if (dev->netdev_ops->ndo_set_features)
6953 err = dev->netdev_ops->ndo_set_features(dev, features);
6957 if (unlikely(err < 0)) {
6959 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6960 err, &features, &dev->features);
6961 /* return non-0 since some features might have changed and
6962 * it's better to fire a spurious notification than miss it
6968 /* some features must be disabled on lower devices when disabled
6969 * on an upper device (think: bonding master or bridge)
6971 netdev_for_each_lower_dev(dev, lower, iter)
6972 netdev_sync_lower_features(dev, lower, features);
6975 dev->features = features;
6977 return err < 0 ? 0 : 1;
6981 * netdev_update_features - recalculate device features
6982 * @dev: the device to check
6984 * Recalculate dev->features set and send notifications if it
6985 * has changed. Should be called after driver or hardware dependent
6986 * conditions might have changed that influence the features.
6988 void netdev_update_features(struct net_device *dev)
6990 if (__netdev_update_features(dev))
6991 netdev_features_change(dev);
6993 EXPORT_SYMBOL(netdev_update_features);
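/*
 * Example: a driver that discovers at runtime that an offload must be
 * turned off would adjust its feature masks and let the core recompute
 * and notify. A sketch, under rtnl; losing TSO is just an example:
 *
 *	dev->hw_features &= ~NETIF_F_TSO;	// no longer changeable
 *	dev->features &= ~NETIF_F_TSO;		// and currently off
 *	netdev_update_features(dev);
 */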
6996 * netdev_change_features - recalculate device features
6997 * @dev: the device to check
6999 * Recalculate dev->features set and send notifications even
7000 * if they have not changed. Should be called instead of
7001 * netdev_update_features() if also dev->vlan_features might
7002 * have changed to allow the changes to be propagated to stacked
7005 void netdev_change_features(struct net_device *dev)
7007 __netdev_update_features(dev);
7008 netdev_features_change(dev);
7010 EXPORT_SYMBOL(netdev_change_features);
7013 * netif_stacked_transfer_operstate - transfer operstate
7014 * @rootdev: the root or lower level device to transfer state from
7015 * @dev: the device to transfer operstate to
7017 * Transfer operational state from root to device. This is normally
7018 * called when a stacking relationship exists between the root
7019 * device and the device (a leaf device).
7021 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
7022 struct net_device *dev)
7024 if (rootdev->operstate == IF_OPER_DORMANT)
7025 netif_dormant_on(dev);
7027 netif_dormant_off(dev);
7029 if (netif_carrier_ok(rootdev)) {
7030 if (!netif_carrier_ok(dev))
7031 netif_carrier_on(dev);
7033 if (netif_carrier_ok(dev))
7034 netif_carrier_off(dev);
7037 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
7040 static int netif_alloc_rx_queues(struct net_device *dev)
7042 unsigned int i, count = dev->num_rx_queues;
7043 struct netdev_rx_queue *rx;
7044 size_t sz = count * sizeof(*rx);
7048 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7056 for (i = 0; i < count; i++)
7062 static void netdev_init_one_queue(struct net_device *dev,
7063 struct netdev_queue *queue, void *_unused)
7065 /* Initialize queue lock */
7066 spin_lock_init(&queue->_xmit_lock);
7067 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
7068 queue->xmit_lock_owner = -1;
7069 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
7072 dql_init(&queue->dql, HZ);
7076 static void netif_free_tx_queues(struct net_device *dev)
7081 static int netif_alloc_netdev_queues(struct net_device *dev)
7083 unsigned int count = dev->num_tx_queues;
7084 struct netdev_queue *tx;
7085 size_t sz = count * sizeof(*tx);
7087 if (count < 1 || count > 0xffff)
7090 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7098 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
7099 spin_lock_init(&dev->tx_global_lock);
7104 void netif_tx_stop_all_queues(struct net_device *dev)
7108 for (i = 0; i < dev->num_tx_queues; i++) {
7109 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
7110 netif_tx_stop_queue(txq);
7113 EXPORT_SYMBOL(netif_tx_stop_all_queues);
7116 * register_netdevice - register a network device
7117 * @dev: device to register
7119 * Take a completed network device structure and add it to the kernel
7120 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7121 * chain. 0 is returned on success. A negative errno code is returned
7122 * on a failure to set up the device, or if the name is a duplicate.
7124 * Callers must hold the rtnl semaphore. You may want
7125 * register_netdev() instead of this.
7128 * The locking appears insufficient to guarantee two parallel registers
7129 * will not get the same name.
7132 int register_netdevice(struct net_device *dev)
7135 struct net *net = dev_net(dev);
7137 BUG_ON(dev_boot_phase);
7142 /* When net_devices are persistent, this will be fatal. */
7143 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7146 spin_lock_init(&dev->addr_list_lock);
7147 netdev_set_addr_lockdep_class(dev);
7149 ret = dev_get_valid_name(net, dev, dev->name);
7153 /* Init, if this function is available */
7154 if (dev->netdev_ops->ndo_init) {
7155 ret = dev->netdev_ops->ndo_init(dev);
7163 if (((dev->hw_features | dev->features) &
7164 NETIF_F_HW_VLAN_CTAG_FILTER) &&
7165 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7166 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7167 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7174 dev->ifindex = dev_new_index(net);
7175 else if (__dev_get_by_index(net, dev->ifindex))
7178 /* Transfer changeable features to wanted_features and enable
7179 * software offloads (GSO and GRO).
7181 dev->hw_features |= NETIF_F_SOFT_FEATURES;
7182 dev->features |= NETIF_F_SOFT_FEATURES;
7183 dev->wanted_features = dev->features & dev->hw_features;
7185 if (!(dev->flags & IFF_LOOPBACK))
7186 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7188 /* If IPv4 TCP segmentation offload is supported we should also
7189 * allow the device to enable segmenting the frame with the option
7190 * of ignoring a static IP ID value. This doesn't enable the
7191 * feature itself but allows the user to enable it later.
7193 if (dev->hw_features & NETIF_F_TSO)
7194 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7195 if (dev->vlan_features & NETIF_F_TSO)
7196 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7197 if (dev->mpls_features & NETIF_F_TSO)
7198 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7199 if (dev->hw_enc_features & NETIF_F_TSO)
7200 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7202 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7204 dev->vlan_features |= NETIF_F_HIGHDMA;
7206 /* Make NETIF_F_SG inheritable to tunnel devices.
7208 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7210 /* Make NETIF_F_SG inheritable to MPLS.
7212 dev->mpls_features |= NETIF_F_SG;
7214 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7215 ret = notifier_to_errno(ret);
7219 ret = netdev_register_kobject(dev);
7222 dev->reg_state = NETREG_REGISTERED;
7224 __netdev_update_features(dev);
7227 * Default initial state at registration is that the
7228 * device is present.
7231 set_bit(__LINK_STATE_PRESENT, &dev->state);
7233 linkwatch_init_dev(dev);
7235 dev_init_scheduler(dev);
7237 list_netdevice(dev);
7238 add_device_randomness(dev->dev_addr, dev->addr_len);
7240 /* If the device has a permanent device address, the driver should
7241 * set dev_addr, and addr_assign_type should be set to
7242 * NET_ADDR_PERM (the default value).
7244 if (dev->addr_assign_type == NET_ADDR_PERM)
7245 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7247 /* Notify protocols, that a new device appeared. */
7248 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7249 ret = notifier_to_errno(ret);
7251 rollback_registered(dev);
7252 dev->reg_state = NETREG_UNREGISTERED;
7255 * Prevent userspace races by waiting until the network
7256 * device is fully set up before sending notifications.
7258 if (!dev->rtnl_link_ops ||
7259 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7260 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7266 if (dev->netdev_ops->ndo_uninit)
7267 dev->netdev_ops->ndo_uninit(dev);
7270 EXPORT_SYMBOL(register_netdevice);
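/* Usage sketch (illustrative, not from this file): register_netdevice()
 * is for callers that already hold the RTNL semaphore, such as rtnl_link
 * ->newlink() handlers. On failure the caller still owns the device and
 * must free it with free_netdev().
 *
 *	ASSERT_RTNL();
 *	err = register_netdevice(dev);
 *	if (err < 0)
 *		free_netdev(dev);
 */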
7273 * init_dummy_netdev - init a dummy network device for NAPI
7274 * @dev: device to init
7276 * This takes a network device structure and initializes the minimum
7277 * number of fields so it can be used to schedule NAPI polls without
7278 * registering a full-blown interface. This is to be used by drivers
7279 * that need to tie several hardware interfaces to a single NAPI
7280 * poll scheduler due to HW limitations.
7282 int init_dummy_netdev(struct net_device *dev)
7284 /* Clear everything. Note we don't initialize spinlocks,
7285 * as they aren't supposed to be taken by any of the
7286 * NAPI code, and this dummy netdev is supposed to be
7287 * used only for NAPI polls
7289 memset(dev, 0, sizeof(struct net_device));
7291 /* make sure we BUG if trying to hit standard
7292 * register/unregister code path
7294 dev->reg_state = NETREG_DUMMY;
7296 /* NAPI wants this */
7297 INIT_LIST_HEAD(&dev->napi_list);
7299 /* a dummy interface is started by default */
7300 set_bit(__LINK_STATE_PRESENT, &dev->state);
7301 set_bit(__LINK_STATE_START, &dev->state);
7303 /* Note: we don't allocate pcpu_refcnt for dummy devices,
7304 * because users of this 'device' don't need to change
7310 EXPORT_SYMBOL_GPL(init_dummy_netdev);
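/* Usage sketch (hypothetical driver; priv, myeth_poll and dummy_dev are
 * illustrative names): a driver that must drive several hardware
 * interfaces from one NAPI poll loop can anchor its napi_struct on a
 * dummy netdev.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, myeth_poll, 64);
 *	napi_enable(&priv->napi);
 */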
7314 * register_netdev - register a network device
7315 * @dev: device to register
7317 * Take a completed network device structure and add it to the kernel
7318 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7319 * chain. 0 is returned on success. A negative errno code is returned
7320 * on a failure to set up the device, or if the name is a duplicate.
7322 * This is a wrapper around register_netdevice that takes the rtnl semaphore
7323 * and expands the device name if you passed a format string to
7326 int register_netdev(struct net_device *dev)
7331 err = register_netdevice(dev);
7335 EXPORT_SYMBOL(register_netdev);
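/* Usage sketch (hypothetical probe path; the myeth_* names are
 * illustrative): register_netdev() takes the RTNL semaphore itself, so
 * it is the right call when the caller does not already hold it.
 *
 *	dev = alloc_etherdev(sizeof(struct myeth_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &myeth_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */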
7337 int netdev_refcnt_read(const struct net_device *dev)
7341 for_each_possible_cpu(i)
7342 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7345 EXPORT_SYMBOL(netdev_refcnt_read);
7348 * netdev_wait_allrefs - wait until all references are gone.
7349 * @dev: target net_device
7351 * This is called when unregistering network devices.
7353 * Any protocol or device that holds a reference should register
7354 * for netdevice notification, and cleanup and put back the
7355 * reference if they receive an UNREGISTER event.
7356 * We can get stuck here if buggy protocols don't correctly
7359 static void netdev_wait_allrefs(struct net_device *dev)
7361 unsigned long rebroadcast_time, warning_time;
7364 linkwatch_forget_dev(dev);
7366 rebroadcast_time = warning_time = jiffies;
7367 refcnt = netdev_refcnt_read(dev);
7369 while (refcnt != 0) {
7370 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7373 /* Rebroadcast unregister notification */
7374 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7380 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7381 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7383 /* We must not have linkwatch events
7384 * pending on unregister. If this
7385 * happens, we simply run the queue
7386 * unscheduled, resulting in a noop
7389 linkwatch_run_queue();
7394 rebroadcast_time = jiffies;
7399 refcnt = netdev_refcnt_read(dev);
7401 if (time_after(jiffies, warning_time + 10 * HZ)) {
7402 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7404 warning_time = jiffies;
7413 * register_netdevice(x1);
7414 * register_netdevice(x2);
7416 * unregister_netdevice(y1);
7417 * unregister_netdevice(y2);
7423 * We are invoked by rtnl_unlock().
7424 * This allows us to deal with problems:
7425 * 1) We can delete sysfs objects which invoke hotplug
7426 * without deadlocking with linkwatch via keventd.
7427 * 2) Since we run with the RTNL semaphore not held, we can sleep
7428 * safely in order to wait for the netdev refcnt to drop to zero.
7430 * We must not return until all unregister events added during
7431 * the interval the lock was held have been completed.
7433 void netdev_run_todo(void)
7435 struct list_head list;
7437 /* Snapshot list, allow later requests */
7438 list_replace_init(&net_todo_list, &list);
7443 /* Wait for rcu callbacks to finish before next phase */
7444 if (!list_empty(&list))
7447 while (!list_empty(&list)) {
7448 struct net_device *dev
7449 = list_first_entry(&list, struct net_device, todo_list);
7450 list_del(&dev->todo_list);
7453 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7456 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7457 pr_err("network todo '%s' but state %d\n",
7458 dev->name, dev->reg_state);
7463 dev->reg_state = NETREG_UNREGISTERED;
7465 netdev_wait_allrefs(dev);
7468 BUG_ON(netdev_refcnt_read(dev));
7469 BUG_ON(!list_empty(&dev->ptype_all));
7470 BUG_ON(!list_empty(&dev->ptype_specific));
7471 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7472 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7473 WARN_ON(dev->dn_ptr);
7475 if (dev->destructor)
7476 dev->destructor(dev);
7478 /* Report a network device has been unregistered */
7480 dev_net(dev)->dev_unreg_count--;
7482 wake_up(&netdev_unregistering_wq);
7484 /* Free network device */
7485 kobject_put(&dev->dev.kobj);
7489 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7490 * all the same fields in the same order as net_device_stats, with only
7491 * the type differing, but rtnl_link_stats64 may have additional fields
7492 * at the end for newer counters.
7494 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7495 const struct net_device_stats *netdev_stats)
7497 #if BITS_PER_LONG == 64
7498 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7499 memcpy(stats64, netdev_stats, sizeof(*stats64));
7500 /* zero out counters that only exist in rtnl_link_stats64 */
7501 memset((char *)stats64 + sizeof(*netdev_stats), 0,
7502 sizeof(*stats64) - sizeof(*netdev_stats));
7504 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7505 const unsigned long *src = (const unsigned long *)netdev_stats;
7506 u64 *dst = (u64 *)stats64;
7508 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7509 for (i = 0; i < n; i++)
7511 /* zero out counters that only exist in rtnl_link_stats64 */
7512 memset((char *)stats64 + n * sizeof(u64), 0,
7513 sizeof(*stats64) - n * sizeof(u64));
7516 EXPORT_SYMBOL(netdev_stats_to_stats64);
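/* Usage sketch (illustrative): a driver that still accumulates into the
 * unsigned-long counters of dev->stats can widen them into the 64-bit
 * structure with this helper.
 *
 *	struct rtnl_link_stats64 stats64;
 *
 *	netdev_stats_to_stats64(&stats64, &dev->stats);
 */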
7519 * dev_get_stats - get network device statistics
7520 * @dev: device to get statistics from
7521 * @storage: place to store stats
7523 * Get network statistics from the device. Return @storage.
7524 * The device driver may provide its own method by setting
7525 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7526 * otherwise the internal statistics structure is used.
7528 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7529 struct rtnl_link_stats64 *storage)
7531 const struct net_device_ops *ops = dev->netdev_ops;
7533 if (ops->ndo_get_stats64) {
7534 memset(storage, 0, sizeof(*storage));
7535 ops->ndo_get_stats64(dev, storage);
7536 } else if (ops->ndo_get_stats) {
7537 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7539 netdev_stats_to_stats64(storage, &dev->stats);
7541 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7542 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7543 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7546 EXPORT_SYMBOL(dev_get_stats);
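/* Usage sketch: callers typically hold RTNL or RCU so the device cannot
 * go away, and pass storage on their own stack.
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu RX packets\n", dev->name,
 *		(unsigned long long)stats.rx_packets);
 */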
7548 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7550 struct netdev_queue *queue = dev_ingress_queue(dev);
7552 #ifdef CONFIG_NET_CLS_ACT
7555 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7558 netdev_init_one_queue(dev, queue, NULL);
7559 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7560 queue->qdisc_sleeping = &noop_qdisc;
7561 rcu_assign_pointer(dev->ingress_queue, queue);
7566 static const struct ethtool_ops default_ethtool_ops;
7568 void netdev_set_default_ethtool_ops(struct net_device *dev,
7569 const struct ethtool_ops *ops)
7571 if (dev->ethtool_ops == &default_ethtool_ops)
7572 dev->ethtool_ops = ops;
7574 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
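/* Usage sketch (hypothetical bus glue; mybus_ethtool_ops is an
 * illustrative name): attach fallback ethtool ops without overriding
 * ops a driver has already set.
 *
 *	netdev_set_default_ethtool_ops(dev, &mybus_ethtool_ops);
 */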
7576 void netdev_freemem(struct net_device *dev)
7578 char *addr = (char *)dev - dev->padded;
7584 * alloc_netdev_mqs - allocate network device
7585 * @sizeof_priv: size of private data to allocate space for
7586 * @name: device name format string
7587 * @name_assign_type: origin of device name
7588 * @setup: callback to initialize device
7589 * @txqs: the number of TX subqueues to allocate
7590 * @rxqs: the number of RX subqueues to allocate
7592 * Allocates a struct net_device with private data area for driver use
7593 * and performs basic initialization. Also allocates subqueue structs
7594 * for each queue on the device.
7596 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7597 unsigned char name_assign_type,
7598 void (*setup)(struct net_device *),
7599 unsigned int txqs, unsigned int rxqs)
7601 struct net_device *dev;
7603 struct net_device *p;
7605 BUG_ON(strlen(name) >= sizeof(dev->name));
7608 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7614 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7619 alloc_size = sizeof(struct net_device);
7621 /* ensure 32-byte alignment of private area */
7622 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7623 alloc_size += sizeof_priv;
7625 /* ensure 32-byte alignment of whole construct */
7626 alloc_size += NETDEV_ALIGN - 1;
7628 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7630 p = vzalloc(alloc_size);
7634 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7635 dev->padded = (char *)dev - (char *)p;
7637 dev->pcpu_refcnt = alloc_percpu(int);
7638 if (!dev->pcpu_refcnt)
7641 if (dev_addr_init(dev))
7647 dev_net_set(dev, &init_net);
7649 dev->gso_max_size = GSO_MAX_SIZE;
7650 dev->gso_max_segs = GSO_MAX_SEGS;
7652 INIT_LIST_HEAD(&dev->napi_list);
7653 INIT_LIST_HEAD(&dev->unreg_list);
7654 INIT_LIST_HEAD(&dev->close_list);
7655 INIT_LIST_HEAD(&dev->link_watch_list);
7656 INIT_LIST_HEAD(&dev->adj_list.upper);
7657 INIT_LIST_HEAD(&dev->adj_list.lower);
7658 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7659 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7660 INIT_LIST_HEAD(&dev->ptype_all);
7661 INIT_LIST_HEAD(&dev->ptype_specific);
7662 #ifdef CONFIG_NET_SCHED
7663 hash_init(dev->qdisc_hash);
7665 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7668 if (!dev->tx_queue_len) {
7669 dev->priv_flags |= IFF_NO_QUEUE;
7670 dev->tx_queue_len = 1;
7673 dev->num_tx_queues = txqs;
7674 dev->real_num_tx_queues = txqs;
7675 if (netif_alloc_netdev_queues(dev))
7679 dev->num_rx_queues = rxqs;
7680 dev->real_num_rx_queues = rxqs;
7681 if (netif_alloc_rx_queues(dev))
7685 strcpy(dev->name, name);
7686 dev->name_assign_type = name_assign_type;
7687 dev->group = INIT_NETDEV_GROUP;
7688 if (!dev->ethtool_ops)
7689 dev->ethtool_ops = &default_ethtool_ops;
7691 nf_hook_ingress_init(dev);
7700 free_percpu(dev->pcpu_refcnt);
7702 netdev_freemem(dev);
7705 EXPORT_SYMBOL(alloc_netdev_mqs);
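/* Usage sketch (illustrative): most drivers use a wrapper such as
 * alloc_etherdev_mqs(), but a direct call with 8 TX and 8 RX queues and
 * a hypothetical myeth_setup callback looks like this.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct myeth_priv), "myeth%d",
 *			       NET_NAME_UNKNOWN, myeth_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */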
7708 * free_netdev - free network device
7711 * This function does the last stage of destroying an allocated device
7712 * interface. The reference to the device object is released.
7713 * If this is the last reference then it will be freed.
7714 * Must be called in process context.
7716 void free_netdev(struct net_device *dev)
7718 struct napi_struct *p, *n;
7721 netif_free_tx_queues(dev);
7726 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7728 /* Flush device addresses */
7729 dev_addr_flush(dev);
7731 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7734 free_percpu(dev->pcpu_refcnt);
7735 dev->pcpu_refcnt = NULL;
7737 /* Compatibility with error handling in drivers */
7738 if (dev->reg_state == NETREG_UNINITIALIZED) {
7739 netdev_freemem(dev);
7743 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7744 dev->reg_state = NETREG_RELEASED;
7746 /* will free via device release */
7747 put_device(&dev->dev);
7749 EXPORT_SYMBOL(free_netdev);
7752 * synchronize_net - Synchronize with packet receive processing
7754 * Wait for packets currently being received to be done.
7755 * Does not block later packets from starting.
7757 void synchronize_net(void)
7760 if (rtnl_is_locked())
7761 synchronize_rcu_expedited();
7765 EXPORT_SYMBOL(synchronize_net);
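/* Usage sketch (illustrative; some_private_ptr and old_ptr are
 * hypothetical, not fields of this file): after unpublishing an
 * RCU-protected pointer, wait out in-flight receive paths before
 * freeing the old object.
 *
 *	RCU_INIT_POINTER(dev->some_private_ptr, NULL);
 *	synchronize_net();
 *	kfree(old_ptr);
 */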
7768 * unregister_netdevice_queue - remove device from the kernel
7772 * This function shuts down a device interface and removes it
7773 * from the kernel tables.
7774 * If @head is not NULL, the device is queued to be unregistered later.
7776 * Callers must hold the rtnl semaphore. You may want
7777 * unregister_netdev() instead of this.
7780 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7785 list_move_tail(&dev->unreg_list, head);
7787 rollback_registered(dev);
7788 /* Finish processing unregister after unlock */
7792 EXPORT_SYMBOL(unregister_netdevice_queue);
7795 * unregister_netdevice_many - unregister many devices
7796 * @head: list of devices
7798 * Note: as most callers use a stack-allocated list_head,
7799 * we force a list_del() to make sure the stack won't be corrupted later.
7801 void unregister_netdevice_many(struct list_head *head)
7803 struct net_device *dev;
7805 if (!list_empty(head)) {
7806 rollback_registered_many(head);
7807 list_for_each_entry(dev, head, unreg_list)
7812 EXPORT_SYMBOL(unregister_netdevice_many);
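/* Usage sketch (illustrative caller, RTNL held; should_remove() is a
 * hypothetical predicate): queueing several devices and unregistering
 * them in one batch amortizes the RCU grace periods.
 *
 *	LIST_HEAD(kill_list);
 *
 *	for_each_netdev(net, dev)
 *		if (should_remove(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */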
7815 * unregister_netdev - remove device from the kernel
7818 * This function shuts down a device interface and removes it
7819 * from the kernel tables.
7821 * This is just a wrapper for unregister_netdevice that takes
7822 * the rtnl semaphore. In general you want to use this and not
7823 * unregister_netdevice.
7825 void unregister_netdev(struct net_device *dev)
7828 unregister_netdevice(dev);
7831 EXPORT_SYMBOL(unregister_netdev);
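/* Usage sketch (typical driver remove path): unregister first, then drop
 * the allocation reference; the order matters.
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */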
7834 * dev_change_net_namespace - move device to a different network namespace
7836 * @net: network namespace
7837 * @pat: If not NULL name pattern to try if the current device name
7838 * is already taken in the destination network namespace.
7840 * This function shuts down a device interface and moves it
7841 * to a new network namespace. On success 0 is returned, on
7842 * a failure a negative errno code is returned.
7844 * Callers must hold the rtnl semaphore.
7847 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7853 /* Don't allow namespace local devices to be moved. */
7855 if (dev->features & NETIF_F_NETNS_LOCAL)
7858 /* Ensure the device has been registered */
7859 if (dev->reg_state != NETREG_REGISTERED)
7862 /* Get out if there is nothing to do */
7864 if (net_eq(dev_net(dev), net))
7867 /* Pick the destination device name, and ensure
7868 * we can use it in the destination network namespace.
7871 if (__dev_get_by_name(net, dev->name)) {
7872 /* We get here if we can't use the current device name */
7875 if (dev_get_valid_name(net, dev, pat) < 0)
7880 * And now a mini version of register_netdevice and unregister_netdevice.
7883 /* If the device is running, close it first. */
7886 /* And unlink it from device chain */
7888 unlist_netdevice(dev);
7892 /* Shutdown queueing discipline. */
7895 /* Notify protocols that we are about to destroy
7896 this device. They should clean up all their state.
7898 Note that dev->reg_state stays at NETREG_REGISTERED.
7899 This is wanted because this way 8021q and macvlan know
7900 that the device is just moving and can keep their slaves up.
7902 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7904 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7905 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7908 * Flush the unicast and multicast chains
7913 /* Send a netdev-removed uevent to the old namespace */
7914 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7915 netdev_adjacent_del_links(dev);
7917 /* Actually switch the network namespace */
7918 dev_net_set(dev, net);
7920 /* If there is an ifindex conflict assign a new one */
7921 if (__dev_get_by_index(net, dev->ifindex))
7922 dev->ifindex = dev_new_index(net);
7924 /* Send a netdev-add uevent to the new namespace */
7925 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7926 netdev_adjacent_add_links(dev);
7928 /* Fixup kobjects */
7929 err = device_rename(&dev->dev, dev->name);
7932 /* Add the device back in the hashes */
7933 list_netdevice(dev);
7935 /* Notify protocols that a new device appeared. */
7936 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7939 * Prevent userspace races by waiting until the network
7940 * device is fully set up before sending notifications.
7942 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7949 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
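/* Usage sketch (illustrative caller, RTNL held; peer_net is a namespace
 * reference the caller already owns): move a device, falling back to an
 * "eth%d" name if the current name is taken in the destination.
 *
 *	err = dev_change_net_namespace(dev, peer_net, "eth%d");
 */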
7951 static int dev_cpu_callback(struct notifier_block *nfb,
7952 unsigned long action,
7955 struct sk_buff **list_skb;
7956 struct sk_buff *skb;
7957 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7958 struct softnet_data *sd, *oldsd;
7960 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7963 local_irq_disable();
7964 cpu = smp_processor_id();
7965 sd = &per_cpu(softnet_data, cpu);
7966 oldsd = &per_cpu(softnet_data, oldcpu);
7968 /* Find end of our completion_queue. */
7969 list_skb = &sd->completion_queue;
7971 list_skb = &(*list_skb)->next;
7972 /* Append completion queue from offline CPU. */
7973 *list_skb = oldsd->completion_queue;
7974 oldsd->completion_queue = NULL;
7976 /* Append output queue from offline CPU. */
7977 if (oldsd->output_queue) {
7978 *sd->output_queue_tailp = oldsd->output_queue;
7979 sd->output_queue_tailp = oldsd->output_queue_tailp;
7980 oldsd->output_queue = NULL;
7981 oldsd->output_queue_tailp = &oldsd->output_queue;
7983 /* Append the NAPI poll list from the offline CPU, with one exception:
7984 * process_backlog() must be called by the CPU owning the percpu backlog.
7985 * We properly handle process_queue & input_pkt_queue later.
7987 while (!list_empty(&oldsd->poll_list)) {
7988 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7992 list_del_init(&napi->poll_list);
7993 if (napi->poll == process_backlog)
7996 ____napi_schedule(sd, napi);
7999 raise_softirq_irqoff(NET_TX_SOFTIRQ);
8002 /* Process offline CPU's input_pkt_queue */
8003 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
8005 input_queue_head_incr(oldsd);
8007 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
8009 input_queue_head_incr(oldsd);
8017 * netdev_increment_features - increment feature set by one
8018 * @all: current feature set
8019 * @one: new feature set
8020 * @mask: mask feature set
8022 * Computes a new feature set after adding a device with feature set
8023 * @one to the master device with current feature set @all. Will not
8024 * enable anything that is off in @mask. Returns the new feature set.
8026 netdev_features_t netdev_increment_features(netdev_features_t all,
8027 netdev_features_t one, netdev_features_t mask)
8029 if (mask & NETIF_F_HW_CSUM)
8030 mask |= NETIF_F_CSUM_MASK;
8031 mask |= NETIF_F_VLAN_CHALLENGED;
8033 all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
8034 all &= one | ~NETIF_F_ALL_FOR_ALL;
8036 /* If one device supports hw checksumming, set for all. */
8037 if (all & NETIF_F_HW_CSUM)
8038 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
8042 EXPORT_SYMBOL(netdev_increment_features);
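/* Usage sketch (illustrative master/slave driver, in the spirit of a
 * bonding-like aggregator): fold each new slave's features into the
 * master's advertised set.
 *
 *	features = netdev_increment_features(master->features,
 *					     slave->features,
 *					     master->hw_features);
 */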
8044 static struct hlist_head * __net_init netdev_create_hash(void)
8047 struct hlist_head *hash;
8049 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
8051 for (i = 0; i < NETDEV_HASHENTRIES; i++)
8052 INIT_HLIST_HEAD(&hash[i]);
8057 /* Initialize per network namespace state */
8058 static int __net_init netdev_init(struct net *net)
8060 if (net != &init_net)
8061 INIT_LIST_HEAD(&net->dev_base_head);
8063 net->dev_name_head = netdev_create_hash();
8064 if (net->dev_name_head == NULL)
8067 net->dev_index_head = netdev_create_hash();
8068 if (net->dev_index_head == NULL)
8074 kfree(net->dev_name_head);
8080 * netdev_drivername - network driver for the device
8081 * @dev: network device
8083 * Determine network driver for device.
8085 const char *netdev_drivername(const struct net_device *dev)
8087 const struct device_driver *driver;
8088 const struct device *parent;
8089 const char *empty = "";
8091 parent = dev->dev.parent;
8095 driver = parent->driver;
8096 if (driver && driver->name)
8097 return driver->name;
8101 static void __netdev_printk(const char *level, const struct net_device *dev,
8102 struct va_format *vaf)
8104 if (dev && dev->dev.parent) {
8105 dev_printk_emit(level[1] - '0',
8108 dev_driver_string(dev->dev.parent),
8109 dev_name(dev->dev.parent),
8110 netdev_name(dev), netdev_reg_state(dev),
8113 printk("%s%s%s: %pV",
8114 level, netdev_name(dev), netdev_reg_state(dev), vaf);
8116 printk("%s(NULL net_device): %pV", level, vaf);
8120 void netdev_printk(const char *level, const struct net_device *dev,
8121 const char *format, ...)
8123 struct va_format vaf;
8126 va_start(args, format);
8131 __netdev_printk(level, dev, &vaf);
8135 EXPORT_SYMBOL(netdev_printk);
8137 #define define_netdev_printk_level(func, level) \
8138 void func(const struct net_device *dev, const char *fmt, ...) \
8140 struct va_format vaf; \
8143 va_start(args, fmt); \
8148 __netdev_printk(level, dev, &vaf); \
8152 EXPORT_SYMBOL(func);
8154 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8155 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8156 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8157 define_netdev_printk_level(netdev_err, KERN_ERR);
8158 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8159 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8160 define_netdev_printk_level(netdev_info, KERN_INFO);
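/* Usage sketch (txq and speed being hypothetical caller locals): drivers
 * use these wrappers instead of raw printk() so that messages carry the
 * driver, bus and interface names.
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */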
8162 static void __net_exit netdev_exit(struct net *net)
8164 kfree(net->dev_name_head);
8165 kfree(net->dev_index_head);
8168 static struct pernet_operations __net_initdata netdev_net_ops = {
8169 .init = netdev_init,
8170 .exit = netdev_exit,
8173 static void __net_exit default_device_exit(struct net *net)
8175 struct net_device *dev, *aux;
8177 * Push all migratable network devices back to the
8178 * initial network namespace
8181 for_each_netdev_safe(net, dev, aux) {
8183 char fb_name[IFNAMSIZ];
8185 /* Ignore unmovable devices (e.g. the loopback device) */
8186 if (dev->features & NETIF_F_NETNS_LOCAL)
8189 /* Leave virtual devices for the generic cleanup */
8190 if (dev->rtnl_link_ops)
8193 /* Push remaining network devices to init_net */
8194 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8195 err = dev_change_net_namespace(dev, &init_net, fb_name);
8197 pr_emerg("%s: failed to move %s to init_net: %d\n",
8198 __func__, dev->name, err);
8205 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8207 /* Return with the rtnl_lock held when there are no network
8208 * devices unregistering in any network namespace in net_list.
8212 DEFINE_WAIT_FUNC(wait, woken_wake_function);
8214 add_wait_queue(&netdev_unregistering_wq, &wait);
8216 unregistering = false;
8218 list_for_each_entry(net, net_list, exit_list) {
8219 if (net->dev_unreg_count > 0) {
8220 unregistering = true;
8228 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8230 remove_wait_queue(&netdev_unregistering_wq, &wait);
8233 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8235 /* At exit all network devices must be removed from a network
8236 * namespace. Do this in the reverse order of registration.
8237 * Do this across as many network namespaces as possible to
8238 * improve batching efficiency.
8240 struct net_device *dev;
8242 LIST_HEAD(dev_kill_list);
8244 /* To prevent network device cleanup code from dereferencing
8245 * loopback devices or network devices that have been freed,
8246 * wait here for all pending unregistrations to complete
8247 * before unregistering the loopback device and allowing the
8248 * network namespace to be freed.
8250 * The netdev todo list containing all network device
8251 * unregistrations that happen in default_device_exit_batch
8252 * will run in the rtnl_unlock() at the end of
8253 * default_device_exit_batch.
8255 rtnl_lock_unregistering(net_list);
8256 list_for_each_entry(net, net_list, exit_list) {
8257 for_each_netdev_reverse(net, dev) {
8258 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8259 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8261 unregister_netdevice_queue(dev, &dev_kill_list);
8264 unregister_netdevice_many(&dev_kill_list);
8268 static struct pernet_operations __net_initdata default_device_ops = {
8269 .exit = default_device_exit,
8270 .exit_batch = default_device_exit_batch,
8274 * Initialize the DEV module. At boot time this walks the device list and
8275 * unhooks any devices that fail to initialize (normally hardware not
8276 * present) and leaves us with a valid list of present and active devices.
8281 * This is called single threaded during boot, so no need
8282 * to take the rtnl semaphore.
8284 static int __init net_dev_init(void)
8286 int i, rc = -ENOMEM;
8288 BUG_ON(!dev_boot_phase);
8290 if (dev_proc_init())
8293 if (netdev_kobject_init())
8296 INIT_LIST_HEAD(&ptype_all);
8297 for (i = 0; i < PTYPE_HASH_SIZE; i++)
8298 INIT_LIST_HEAD(&ptype_base[i]);
8300 INIT_LIST_HEAD(&offload_base);
8302 if (register_pernet_subsys(&netdev_net_ops))
8306 * Initialize the packet receive queues.
8309 for_each_possible_cpu(i) {
8310 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8311 struct softnet_data *sd = &per_cpu(softnet_data, i);
8313 INIT_WORK(flush, flush_backlog);
8315 skb_queue_head_init(&sd->input_pkt_queue);
8316 skb_queue_head_init(&sd->process_queue);
8317 INIT_LIST_HEAD(&sd->poll_list);
8318 sd->output_queue_tailp = &sd->output_queue;
8320 sd->csd.func = rps_trigger_softirq;
8325 sd->backlog.poll = process_backlog;
8326 sd->backlog.weight = weight_p;
8331 /* The loopback device is special: if any other network device
8332 * is present in a network namespace, the loopback device must
8333 * be present too. Since we now dynamically allocate and free the
8334 * loopback device, maintain this invariant by keeping the
8335 * loopback device the first device on the list of network
8336 * devices, ensuring that the loopback device is the first
8337 * device that appears and the last network device
8340 if (register_pernet_device(&loopback_net_ops))
8343 if (register_pernet_device(&default_device_ops))
8346 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8347 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8349 hotcpu_notifier(dev_cpu_callback, 0);
8356 subsys_initcall(net_dev_init);