net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138 #include <linux/netfilter_ingress.h>
 139
 140 #include "net-sysfs.h"
 141
 142 /* Instead of increasing this, you should create a hash table. */
 143 #define MAX_GRO_SKBS 8
 144
 145 /* This should be increased if a protocol with a bigger head is added. */
 146 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 147
 148 static DEFINE_SPINLOCK(ptype_lock);
 149 static DEFINE_SPINLOCK(offload_lock);
 150 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 151 struct list_head ptype_all __read_mostly;       /* Taps */
 152 static struct list_head offload_base __read_mostly;
 153
 154 static int netif_rx_internal(struct sk_buff *skb);
 155 static int call_netdevice_notifiers_info(unsigned long val,
 156                                          struct net_device *dev,
 157                                          struct netdev_notifier_info *info);
 158
 159 /*
 160  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 161  * semaphore.
 162  *
 163  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 164  *
 165  * Writers must hold the rtnl semaphore while they loop through the
 166  * dev_base_head list, and hold dev_base_lock for writing when they do the
 167  * actual updates.  This allows pure readers to access the list even
 168  * while a writer is preparing to update it.
 169  *
 170  * To put it another way, dev_base_lock is held for writing only to
 171  * protect against pure readers; the rtnl semaphore provides the
 172  * protection against other writers.
 173  *
 174  * See, for example usages, register_netdevice() and
 175  * unregister_netdevice(), which must be called with the rtnl
 176  * semaphore held.
 177  */
 178 DEFINE_RWLOCK(dev_base_lock);
 179 EXPORT_SYMBOL(dev_base_lock);
 180
 181 /* protects napi_hash addition/deletion and napi_gen_id */
 182 static DEFINE_SPINLOCK(napi_hash_lock);
 183
 184 static unsigned int napi_gen_id;
 185 static DEFINE_HASHTABLE(napi_hash, 8);
 186
 187 static seqcount_t devnet_rename_seq;
 188
 189 static inline void dev_base_seq_inc(struct net *net)
 190 {
 191         while (++net->dev_base_seq == 0);
 192 }
 193
 194 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 195 {
 196         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 197
 198         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 199 }
 200
 201 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 202 {
 203         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 204 }
 205
 206 static inline void rps_lock(struct softnet_data *sd)
 207 {
 208 #ifdef CONFIG_RPS
 209         spin_lock(&sd->input_pkt_queue.lock);
 210 #endif
 211 }
 212
 213 static inline void rps_unlock(struct softnet_data *sd)
 214 {
 215 #ifdef CONFIG_RPS
 216         spin_unlock(&sd->input_pkt_queue.lock);
 217 #endif
 218 }
 219
 220 /* Device list insertion */
 221 static void list_netdevice(struct net_device *dev)
 222 {
 223         struct net *net = dev_net(dev);
 224
 225         ASSERT_RTNL();
 226
 227         write_lock_bh(&dev_base_lock);
 228         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 229         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 230         hlist_add_head_rcu(&dev->index_hlist,
 231                            dev_index_hash(net, dev->ifindex));
 232         write_unlock_bh(&dev_base_lock);
 233
 234         dev_base_seq_inc(net);
 235 }
 236
 237 /* Device list removal
 238  * caller must respect a RCU grace period before freeing/reusing dev
 239  */
 240 static void unlist_netdevice(struct net_device *dev)
 241 {
 242         ASSERT_RTNL();
 243
 244         /* Unlink dev from the device chain */
 245         write_lock_bh(&dev_base_lock);
 246         list_del_rcu(&dev->dev_list);
 247         hlist_del_rcu(&dev->name_hlist);
 248         hlist_del_rcu(&dev->index_hlist);
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(dev_net(dev));
 252 }
 253
 254 /*
 255  *      Our notifier list
 256  */
 257
 258 static RAW_NOTIFIER_HEAD(netdev_chain);
 259
 260 /*
 261  *      Device drivers call our routines to queue packets here. We empty the
 262  *      queue in the local softnet handler.
 263  */
 264
 265 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 266 EXPORT_PER_CPU_SYMBOL(softnet_data);
 267
 268 #ifdef CONFIG_LOCKDEP
 269 /*
 270  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 271  * according to dev->type
 272  */
 273 static const unsigned short netdev_lock_type[] =
 274         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 275          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 276          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 277          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 278          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 279          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 280          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 281          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 282          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 283          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 284          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 285          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 286          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 287          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 288          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 289
 290 static const char *const netdev_lock_name[] =
 291         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 292          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 293          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 294          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 295          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 296          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 297          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 298          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 299          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 300          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 301          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 302          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 303          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 304          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 305          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 306
 307 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 309
 310 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 311 {
 312         int i;
 313
 314         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 315                 if (netdev_lock_type[i] == dev_type)
 316                         return i;
 317         /* the last key is used by default */
 318         return ARRAY_SIZE(netdev_lock_type) - 1;
 319 }
 320
 321 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 322                                                  unsigned short dev_type)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev_type);
 327         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 328                                    netdev_lock_name[i]);
 329 }
 330
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333         int i;
 334
 335         i = netdev_lock_pos(dev->type);
 336         lockdep_set_class_and_name(&dev->addr_list_lock,
 337                                    &netdev_addr_lock_key[i],
 338                                    netdev_lock_name[i]);
 339 }
 340 #else
 341 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 342                                                  unsigned short dev_type)
 343 {
 344 }
 345 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 346 {
 347 }
 348 #endif
 349
 350 /*******************************************************************************
 351
 352                 Protocol management and registration routines
 353
 354 *******************************************************************************/
 355
 356 /*
 357  *      Add a protocol ID to the list. Now that the input handler is
 358  *      smarter we can dispense with all the messy stuff that used to be
 359  *      here.
 360  *
 361  *      BEWARE!!! Protocol handlers, mangling input packets,
 362  *      MUST BE last in hash buckets and checking protocol handlers
 363  *      MUST start from promiscuous ptype_all chain in net_bh.
 364  *      It is true now, do not change it.
 365  *      Explanation follows: if protocol handler, mangling packet, will
 366  *      be the first on list, it is not able to sense, that packet
 367  *      is cloned and should be copied-on-write, so that it will
 368  *      change it and subsequent readers will get broken packet.
 369  *                                                      --ANK (980803)
 370  */
 371
 372 static inline struct list_head *ptype_head(const struct packet_type *pt)
 373 {
 374         if (pt->type == htons(ETH_P_ALL))
 375                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 376         else
 377                 return pt->dev ? &pt->dev->ptype_specific :
 378                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 379 }
 380
 381 /**
 382  *      dev_add_pack - add packet handler
 383  *      @pt: packet type declaration
 384  *
 385  *      Add a protocol handler to the networking stack. The passed &packet_type
 386  *      is linked into kernel lists and may not be freed until it has been
 387  *      removed from the kernel lists.
 388  *
 389  *      This call does not sleep therefore it can not
 390  *      guarantee all CPU's that are in middle of receiving packets
 391  *      will see the new packet type (until the next received packet).
 392  */
 393
 394 void dev_add_pack(struct packet_type *pt)
 395 {
 396         struct list_head *head = ptype_head(pt);
 397
 398         spin_lock(&ptype_lock);
 399         list_add_rcu(&pt->list, head);
 400         spin_unlock(&ptype_lock);
 401 }
 402 EXPORT_SYMBOL(dev_add_pack);
 403
 404 /**
 405  *      __dev_remove_pack        - remove packet handler
 406  *      @pt: packet type declaration
 407  *
 408  *      Remove a protocol handler that was previously added to the kernel
 409  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 410  *      from the kernel lists and can be freed or reused once this function
 411  *      returns.
 412  *
 413  *      The packet type might still be in use by receivers
 414  *      and must not be freed until after all the CPU's have gone
 415  *      through a quiescent state.
 416  */
 417 void __dev_remove_pack(struct packet_type *pt)
 418 {
 419         struct list_head *head = ptype_head(pt);
 420         struct packet_type *pt1;
 421
 422         spin_lock(&ptype_lock);
 423
 424         list_for_each_entry(pt1, head, list) {
 425                 if (pt == pt1) {
 426                         list_del_rcu(&pt->list);
 427                         goto out;
 428                 }
 429         }
 430
 431         pr_warn("dev_remove_pack: %p not found\n", pt);
 432 out:
 433         spin_unlock(&ptype_lock);
 434 }
 435 EXPORT_SYMBOL(__dev_remove_pack);
 436
 437 /**
 438  *      dev_remove_pack  - remove packet handler
 439  *      @pt: packet type declaration
 440  *
 441  *      Remove a protocol handler that was previously added to the kernel
 442  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 443  *      from the kernel lists and can be freed or reused once this function
 444  *      returns.
 445  *
 446  *      This call sleeps to guarantee that no CPU is looking at the packet
 447  *      type after return.
 448  */
 449 void dev_remove_pack(struct packet_type *pt)
 450 {
 451         __dev_remove_pack(pt);
 452
 453         synchronize_net();
 454 }
 455 EXPORT_SYMBOL(dev_remove_pack);
 456
 457
 458 /**
 459  *      dev_add_offload - register offload handlers
 460  *      @po: protocol offload declaration
 461  *
 462  *      Add protocol offload handlers to the networking stack. The passed
 463  *      &proto_offload is linked into kernel lists and may not be freed until
 464  *      it has been removed from the kernel lists.
 465  *
 466  *      This call does not sleep therefore it can not
 467  *      guarantee all CPU's that are in middle of receiving packets
 468  *      will see the new offload handlers (until the next received packet).
 469  */
 470 void dev_add_offload(struct packet_offload *po)
 471 {
 472         struct packet_offload *elem;
 473
 474         spin_lock(&offload_lock);
 475         list_for_each_entry(elem, &offload_base, list) {
 476                 if (po->priority < elem->priority)
 477                         break;
 478         }
 479         list_add_rcu(&po->list, elem->list.prev);
 480         spin_unlock(&offload_lock);
 481 }
 482 EXPORT_SYMBOL(dev_add_offload);
 483
 484 /**
 485  *      __dev_remove_offload     - remove offload handler
 486  *      @po: packet offload declaration
 487  *
 488  *      Remove a protocol offload handler that was previously added to the
 489  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 490  *      is removed from the kernel lists and can be freed or reused once this
 491  *      function returns.
 492  *
 493  *      The packet type might still be in use by receivers
 494  *      and must not be freed until after all the CPU's have gone
 495  *      through a quiescent state.
 496  */
 497 static void __dev_remove_offload(struct packet_offload *po)
 498 {
 499         struct list_head *head = &offload_base;
 500         struct packet_offload *po1;
 501
 502         spin_lock(&offload_lock);
 503
 504         list_for_each_entry(po1, head, list) {
 505                 if (po == po1) {
 506                         list_del_rcu(&po->list);
 507                         goto out;
 508                 }
 509         }
 510
 511         pr_warn("dev_remove_offload: %p not found\n", po);
 512 out:
 513         spin_unlock(&offload_lock);
 514 }
 515
 516 /**
 517  *      dev_remove_offload       - remove packet offload handler
 518  *      @po: packet offload declaration
 519  *
 520  *      Remove a packet offload handler that was previously added to the kernel
 521  *      offload handlers by dev_add_offload(). The passed &offload_type is
 522  *      removed from the kernel lists and can be freed or reused once this
 523  *      function returns.
 524  *
 525  *      This call sleeps to guarantee that no CPU is looking at the packet
 526  *      type after return.
 527  */
 528 void dev_remove_offload(struct packet_offload *po)
 529 {
 530         __dev_remove_offload(po);
 531
 532         synchronize_net();
 533 }
 534 EXPORT_SYMBOL(dev_remove_offload);
 535
 536 /******************************************************************************
 537
 538                       Device Boot-time Settings Routines
 539
 540 *******************************************************************************/
 541
 542 /* Boot time configuration table */
 543 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 544
 545 /**
 546  *      netdev_boot_setup_add   - add new setup entry
 547  *      @name: name of the device
 548  *      @map: configured settings for the device
 549  *
 550  *      Adds new setup entry to the dev_boot_setup list.  The function
 551  *      returns 0 on error and 1 on success.  This is a generic routine to
 552  *      all netdevices.
 553  */
 554 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 555 {
 556         struct netdev_boot_setup *s;
 557         int i;
 558
 559         s = dev_boot_setup;
 560         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 561                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 562                         memset(s[i].name, 0, sizeof(s[i].name));
 563                         strlcpy(s[i].name, name, IFNAMSIZ);
 564                         memcpy(&s[i].map, map, sizeof(s[i].map));
 565                         break;
 566                 }
 567         }
 568
 569         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 570 }
 571
 572 /**
 573  *      netdev_boot_setup_check - check boot time settings
 574  *      @dev: the netdevice
 575  *
 576  *      Check boot time settings for the device.
 577  *      The found settings are set for the device to be used
 578  *      later in the device probing.
 579  *      Returns 0 if no settings found, 1 if they are.
 580  */
 581 int netdev_boot_setup_check(struct net_device *dev)
 582 {
 583         struct netdev_boot_setup *s = dev_boot_setup;
 584         int i;
 585
 586         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 587                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 588                     !strcmp(dev->name, s[i].name)) {
 589                         dev->irq        = s[i].map.irq;
 590                         dev->base_addr  = s[i].map.base_addr;
 591                         dev->mem_start  = s[i].map.mem_start;
 592                         dev->mem_end    = s[i].map.mem_end;
 593                         return 1;
 594                 }
 595         }
 596         return 0;
 597 }
 598 EXPORT_SYMBOL(netdev_boot_setup_check);
 599
 600
 601 /**
 602  *      netdev_boot_base        - get address from boot time settings
 603  *      @prefix: prefix for network device
 604  *      @unit: id for network device
 605  *
 606  *      Check boot time settings for the base address of device.
 607  *      The found settings are set for the device to be used
 608  *      later in the device probing.
 609  *      Returns 0 if no settings found.
 610  */
 611 unsigned long netdev_boot_base(const char *prefix, int unit)
 612 {
 613         const struct netdev_boot_setup *s = dev_boot_setup;
 614         char name[IFNAMSIZ];
 615         int i;
 616
 617         sprintf(name, "%s%d", prefix, unit);
 618
 619         /*
 620          * If device already registered then return base of 1
 621          * to indicate not to probe for this interface
 622          */
 623         if (__dev_get_by_name(&init_net, name))
 624                 return 1;
 625
 626         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 627                 if (!strcmp(name, s[i].name))
 628                         return s[i].map.base_addr;
 629         return 0;
 630 }
 631
 632 /*
 633  * Saves at boot time configured settings for any netdevice.
 634  */
 635 int __init netdev_boot_setup(char *str)
 636 {
 637         int ints[5];
 638         struct ifmap map;
 639
 640         str = get_options(str, ARRAY_SIZE(ints), ints);
 641         if (!str || !*str)
 642                 return 0;
 643
 644         /* Save settings */
 645         memset(&map, 0, sizeof(map));
 646         if (ints[0] > 0)
 647                 map.irq = ints[1];
 648         if (ints[0] > 1)
 649                 map.base_addr = ints[2];
 650         if (ints[0] > 2)
 651                 map.mem_start = ints[3];
 652         if (ints[0] > 3)
 653                 map.mem_end = ints[4];
 654
 655         /* Add new entry to the list */
 656         return netdev_boot_setup_add(str, &map);
 657 }
 658
 659 __setup("netdev=", netdev_boot_setup);
 660
 661 /*******************************************************************************
 662
 663                             Device Interface Subroutines
 664
 665 *******************************************************************************/
 666
 667 /**
 668  *      dev_get_iflink  - get 'iflink' value of a interface
 669  *      @dev: targeted interface
 670  *
 671  *      Indicates the ifindex the interface is linked to.
 672  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 673  */
 674
 675 int dev_get_iflink(const struct net_device *dev)
 676 {
 677         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 678                 return dev->netdev_ops->ndo_get_iflink(dev);
 679
 680         /* If dev->rtnl_link_ops is set, it's a virtual interface. */
 681         if (dev->rtnl_link_ops)
 682                 return 0;
 683
 684         return dev->ifindex;
 685 }
 686 EXPORT_SYMBOL(dev_get_iflink);
 687
 688 /**
 689  *      __dev_get_by_name       - find a device by its name
 690  *      @net: the applicable net namespace
 691  *      @name: name to find
 692  *
 693  *      Find an interface by name. Must be called under RTNL semaphore
 694  *      or @dev_base_lock. If the name is found a pointer to the device
 695  *      is returned. If the name is not found then %NULL is returned. The
 696  *      reference counters are not incremented so the caller must be
 697  *      careful with locks.
 698  */
 699
 700 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 701 {
 702         struct net_device *dev;
 703         struct hlist_head *head = dev_name_hash(net, name);
 704
 705         hlist_for_each_entry(dev, head, name_hlist)
 706                 if (!strncmp(dev->name, name, IFNAMSIZ))
 707                         return dev;
 708
 709         return NULL;
 710 }
 711 EXPORT_SYMBOL(__dev_get_by_name);
 712
 713 /**
 714  *      dev_get_by_name_rcu     - find a device by its name
 715  *      @net: the applicable net namespace
 716  *      @name: name to find
 717  *
 718  *      Find an interface by name.
 719  *      If the name is found a pointer to the device is returned.
 720  *      If the name is not found then %NULL is returned.
 721  *      The reference counters are not incremented so the caller must be
 722  *      careful with locks. The caller must hold RCU lock.
 723  */
 724
 725 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 726 {
 727         struct net_device *dev;
 728         struct hlist_head *head = dev_name_hash(net, name);
 729
 730         hlist_for_each_entry_rcu(dev, head, name_hlist)
 731                 if (!strncmp(dev->name, name, IFNAMSIZ))
 732                         return dev;
 733
 734         return NULL;
 735 }
 736 EXPORT_SYMBOL(dev_get_by_name_rcu);
 737
 738 /**
 739  *      dev_get_by_name         - find a device by its name
 740  *      @net: the applicable net namespace
 741  *      @name: name to find
 742  *
 743  *      Find an interface by name. This can be called from any
 744  *      context and does its own locking. The returned handle has
 745  *      the usage count incremented and the caller must use dev_put() to
 746  *      release it when it is no longer needed. %NULL is returned if no
 747  *      matching device is found.
 748  */
 749
 750 struct net_device *dev_get_by_name(struct net *net, const char *name)
 751 {
 752         struct net_device *dev;
 753
 754         rcu_read_lock();
 755         dev = dev_get_by_name_rcu(net, name);
 756         if (dev)
 757                 dev_hold(dev);
 758         rcu_read_unlock();
 759         return dev;
 760 }
 761 EXPORT_SYMBOL(dev_get_by_name);
 762
 763 /**
 764  *      __dev_get_by_index - find a device by its ifindex
 765  *      @net: the applicable net namespace
 766  *      @ifindex: index of device
 767  *
 768  *      Search for an interface by index. Returns %NULL if the device
 769  *      is not found or a pointer to the device. The device has not
 770  *      had its reference counter increased so the caller must be careful
 771  *      about locking. The caller must hold either the RTNL semaphore
 772  *      or @dev_base_lock.
 773  */
 774
 775 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 776 {
 777         struct net_device *dev;
 778         struct hlist_head *head = dev_index_hash(net, ifindex);
 779
 780         hlist_for_each_entry(dev, head, index_hlist)
 781                 if (dev->ifindex == ifindex)
 782                         return dev;
 783
 784         return NULL;
 785 }
 786 EXPORT_SYMBOL(__dev_get_by_index);
 787
 788 /**
 789  *      dev_get_by_index_rcu - find a device by its ifindex
 790  *      @net: the applicable net namespace
 791  *      @ifindex: index of device
 792  *
 793  *      Search for an interface by index. Returns %NULL if the device
 794  *      is not found or a pointer to the device. The device has not
 795  *      had its reference counter increased so the caller must be careful
 796  *      about locking. The caller must hold RCU lock.
 797  */
 798
 799 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 800 {
 801         struct net_device *dev;
 802         struct hlist_head *head = dev_index_hash(net, ifindex);
 803
 804         hlist_for_each_entry_rcu(dev, head, index_hlist)
 805                 if (dev->ifindex == ifindex)
 806                         return dev;
 807
 808         return NULL;
 809 }
 810 EXPORT_SYMBOL(dev_get_by_index_rcu);
 811
 812
 813 /**
 814  *      dev_get_by_index - find a device by its ifindex
 815  *      @net: the applicable net namespace
 816  *      @ifindex: index of device
 817  *
 818  *      Search for an interface by index. Returns NULL if the device
 819  *      is not found or a pointer to the device. The device returned has
 820  *      had a reference added and the pointer is safe until the user calls
 821  *      dev_put to indicate they have finished with it.
 822  */
 823
 824 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 825 {
 826         struct net_device *dev;
 827
 828         rcu_read_lock();
 829         dev = dev_get_by_index_rcu(net, ifindex);
 830         if (dev)
 831                 dev_hold(dev);
 832         rcu_read_unlock();
 833         return dev;
 834 }
 835 EXPORT_SYMBOL(dev_get_by_index);
 836
 837 /**
 838  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 839  *      @net: network namespace
 840  *      @name: a pointer to the buffer where the name will be stored.
 841  *      @ifindex: the ifindex of the interface to get the name from.
 842  *
 843  *      The use of raw_seqcount_begin() and cond_resched() before
 844  *      retrying is required as we want to give the writers a chance
 845  *      to complete when CONFIG_PREEMPT is not set.
 846  */
 847 int netdev_get_name(struct net *net, char *name, int ifindex)
 848 {
 849         struct net_device *dev;
 850         unsigned int seq;
 851
 852 retry:
 853         seq = raw_seqcount_begin(&devnet_rename_seq);
 854         rcu_read_lock();
 855         dev = dev_get_by_index_rcu(net, ifindex);
 856         if (!dev) {
 857                 rcu_read_unlock();
 858                 return -ENODEV;
 859         }
 860
 861         strcpy(name, dev->name);
 862         rcu_read_unlock();
 863         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 864                 cond_resched();
 865                 goto retry;
 866         }
 867
 868         return 0;
 869 }
 870
 871 /**
 872  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 873  *      @net: the applicable net namespace
 874  *      @type: media type of device
 875  *      @ha: hardware address
 876  *
 877  *      Search for an interface by MAC address. Returns NULL if the device
 878  *      is not found or a pointer to the device.
 879  *      The caller must hold RCU or RTNL.
 880  *      The returned device has not had its ref count increased
 881  *      and the caller must therefore be careful about locking
 882  *
 883  */
 884
 885 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 886                                        const char *ha)
 887 {
 888         struct net_device *dev;
 889
 890         for_each_netdev_rcu(net, dev)
 891                 if (dev->type == type &&
 892                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 893                         return dev;
 894
 895         return NULL;
 896 }
 897 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 898
 899 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 900 {
 901         struct net_device *dev;
 902
 903         ASSERT_RTNL();
 904         for_each_netdev(net, dev)
 905                 if (dev->type == type)
 906                         return dev;
 907
 908         return NULL;
 909 }
 910 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 911
 912 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 913 {
 914         struct net_device *dev, *ret = NULL;
 915
 916         rcu_read_lock();
 917         for_each_netdev_rcu(net, dev)
 918                 if (dev->type == type) {
 919                         dev_hold(dev);
 920                         ret = dev;
 921                         break;
 922                 }
 923         rcu_read_unlock();
 924         return ret;
 925 }
 926 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 927
 928 /**
 929  *      __dev_get_by_flags - find any device with given flags
 930  *      @net: the applicable net namespace
 931  *      @if_flags: IFF_* values
 932  *      @mask: bitmask of bits in if_flags to check
 933  *
 934  *      Search for any interface with the given flags. Returns NULL if a device
 935  *      is not found or a pointer to the device. Must be called inside
 936  *      rtnl_lock(), and result refcount is unchanged.
 937  */
 938
 939 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 940                                       unsigned short mask)
 941 {
 942         struct net_device *dev, *ret;
 943
 944         ASSERT_RTNL();
 945
 946         ret = NULL;
 947         for_each_netdev(net, dev) {
 948                 if (((dev->flags ^ if_flags) & mask) == 0) {
 949                         ret = dev;
 950                         break;
 951                 }
 952         }
 953         return ret;
 954 }
 955 EXPORT_SYMBOL(__dev_get_by_flags);
 956
 957 /**
 958  *      dev_valid_name - check if name is okay for network device
 959  *      @name: name string
 960  *
 961  *      Network device names need to be valid file names to
 962  *      to allow sysfs to work.  We also disallow any kind of
 963  *      whitespace.
 964  */
 965 bool dev_valid_name(const char *name)
 966 {
 967         if (*name == '\0')
 968                 return false;
 969         if (strlen(name) >= IFNAMSIZ)
 970                 return false;
 971         if (!strcmp(name, ".") || !strcmp(name, ".."))
 972                 return false;
 973
 974         while (*name) {
 975                 if (*name == '/' || *name == ':' || isspace(*name))
 976                         return false;
 977                 name++;
 978         }
 979         return true;
 980 }
 981 EXPORT_SYMBOL(dev_valid_name);
 982
 983 /**
 984  *      __dev_alloc_name - allocate a name for a device
 985  *      @net: network namespace to allocate the device name in
 986  *      @name: name format string
 987  *      @buf:  scratch buffer and result name string
 988  *
 989  *      Passed a format string - eg "lt%d" it will try and find a suitable
 990  *      id. It scans list of devices to build up a free map, then chooses
 991  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 992  *      while allocating the name and adding the device in order to avoid
 993  *      duplicates.
 994  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 995  *      Returns the number of the unit assigned or a negative errno code.
 996  */
 997
 998 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 999 {
1000         int i = 0;
1001         const char *p;
1002         const int max_netdevices = 8*PAGE_SIZE;
1003         unsigned long *inuse;
1004         struct net_device *d;
1005
1006         p = strnchr(name, IFNAMSIZ-1, '%');
1007         if (p) {
1008                 /*
1009                  * Verify the string as this thing may have come from
1010                  * the user.  There must be either one "%d" and no other "%"
1011                  * characters.
1012                  */
1013                 if (p[1] != 'd' || strchr(p + 2, '%'))
1014                         return -EINVAL;
1015
1016                 /* Use one page as a bit array of possible slots */
1017                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1018                 if (!inuse)
1019                         return -ENOMEM;
1020
1021                 for_each_netdev(net, d) {
1022                         if (!sscanf(d->name, name, &i))
1023                                 continue;
1024                         if (i < 0 || i >= max_netdevices)
1025                                 continue;
1026
1027                         /*  avoid cases where sscanf is not exact inverse of printf */
1028                         snprintf(buf, IFNAMSIZ, name, i);
1029                         if (!strncmp(buf, d->name, IFNAMSIZ))
1030                                 set_bit(i, inuse);
1031                 }
1032
1033                 i = find_first_zero_bit(inuse, max_netdevices);
1034                 free_page((unsigned long) inuse);
1035         }
1036
1037         if (buf != name)
1038                 snprintf(buf, IFNAMSIZ, name, i);
1039         if (!__dev_get_by_name(net, buf))
1040                 return i;
1041
1042         /* It is possible to run out of possible slots
1043          * when the name is long and there isn't enough space left
1044          * for the digits, or if all bits are used.
1045          */
1046         return -ENFILE;
1047 }
1048
1049 /**
1050  *      dev_alloc_name - allocate a name for a device
1051  *      @dev: device
1052  *      @name: name format string
1053  *
1054  *      Passed a format string - eg "lt%d" it will try and find a suitable
1055  *      id. It scans list of devices to build up a free map, then chooses
1056  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1057  *      while allocating the name and adding the device in order to avoid
1058  *      duplicates.
1059  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1060  *      Returns the number of the unit assigned or a negative errno code.
1061  */
1062
1063 int dev_alloc_name(struct net_device *dev, const char *name)
1064 {
1065         char buf[IFNAMSIZ];
1066         struct net *net;
1067         int ret;
1068
1069         BUG_ON(!dev_net(dev));
1070         net = dev_net(dev);
1071         ret = __dev_alloc_name(net, name, buf);
1072         if (ret >= 0)
1073                 strlcpy(dev->name, buf, IFNAMSIZ);
1074         return ret;
1075 }
1076 EXPORT_SYMBOL(dev_alloc_name);
1077
1078 static int dev_alloc_name_ns(struct net *net,
1079                              struct net_device *dev,
1080                              const char *name)
1081 {
1082         char buf[IFNAMSIZ];
1083         int ret;
1084
1085         ret = __dev_alloc_name(net, name, buf);
1086         if (ret >= 0)
1087                 strlcpy(dev->name, buf, IFNAMSIZ);
1088         return ret;
1089 }
1090
1091 static int dev_get_valid_name(struct net *net,
1092                               struct net_device *dev,
1093                               const char *name)
1094 {
1095         BUG_ON(!net);
1096
1097         if (!dev_valid_name(name))
1098                 return -EINVAL;
1099
1100         if (strchr(name, '%'))
1101                 return dev_alloc_name_ns(net, dev, name);
1102         else if (__dev_get_by_name(net, name))
1103                 return -EEXIST;
1104         else if (dev->name != name)
1105                 strlcpy(dev->name, name, IFNAMSIZ);
1106
1107         return 0;
1108 }
1109
1110 /**
1111  *      dev_change_name - change name of a device
1112  *      @dev: device
1113  *      @newname: name (or format string) must be at least IFNAMSIZ
1114  *
1115  *      Change name of a device, can pass format strings "eth%d".
1116  *      for wildcarding.
1117  */
1118 int dev_change_name(struct net_device *dev, const char *newname)
1119 {
1120         unsigned char old_assign_type;
1121         char oldname[IFNAMSIZ];
1122         int err = 0;
1123         int ret;
1124         struct net *net;
1125
1126         ASSERT_RTNL();
1127         BUG_ON(!dev_net(dev));
1128
1129         net = dev_net(dev);
1130         if (dev->flags & IFF_UP)
1131                 return -EBUSY;
1132
1133         write_seqcount_begin(&devnet_rename_seq);
1134
1135         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1136                 write_seqcount_end(&devnet_rename_seq);
1137                 return 0;
1138         }
1139
1140         memcpy(oldname, dev->name, IFNAMSIZ);
1141
1142         err = dev_get_valid_name(net, dev, newname);
1143         if (err < 0) {
1144                 write_seqcount_end(&devnet_rename_seq);
1145                 return err;
1146         }
1147
1148         if (oldname[0] && !strchr(oldname, '%'))
1149                 netdev_info(dev, "renamed from %s\n", oldname);
1150
1151         old_assign_type = dev->name_assign_type;
1152         dev->name_assign_type = NET_NAME_RENAMED;
1153
1154 rollback:
1155         ret = device_rename(&dev->dev, dev->name);
1156         if (ret) {
1157                 memcpy(dev->name, oldname, IFNAMSIZ);
1158                 dev->name_assign_type = old_assign_type;
1159                 write_seqcount_end(&devnet_rename_seq);
1160                 return ret;
1161         }
1162
1163         write_seqcount_end(&devnet_rename_seq);
1164
1165         netdev_adjacent_rename_links(dev, oldname);
1166
1167         write_lock_bh(&dev_base_lock);
1168         hlist_del_rcu(&dev->name_hlist);
1169         write_unlock_bh(&dev_base_lock);
1170
1171         synchronize_rcu();
1172
1173         write_lock_bh(&dev_base_lock);
1174         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1175         write_unlock_bh(&dev_base_lock);
1176
1177         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1178         ret = notifier_to_errno(ret);
1179
1180         if (ret) {
1181                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1182                 if (err >= 0) {
1183                         err = ret;
1184                         write_seqcount_begin(&devnet_rename_seq);
1185                         memcpy(dev->name, oldname, IFNAMSIZ);
1186                         memcpy(oldname, newname, IFNAMSIZ);
1187                         dev->name_assign_type = old_assign_type;
1188                         old_assign_type = NET_NAME_RENAMED;
1189                         goto rollback;
1190                 } else {
1191                         pr_err("%s: name change rollback failed: %d\n",
1192                                dev->name, ret);
1193                 }
1194         }
1195
1196         return err;
1197 }
1198
1199 /**
1200  *      dev_set_alias - change ifalias of a device
1201  *      @dev: device
1202  *      @alias: name up to IFALIASZ
1203  *      @len: limit of bytes to copy from info
1204  *
1205  *      Set ifalias for a device,
1206  */
1207 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1208 {
1209         char *new_ifalias;
1210
1211         ASSERT_RTNL();
1212
1213         if (len >= IFALIASZ)
1214                 return -EINVAL;
1215
1216         if (!len) {
1217                 kfree(dev->ifalias);
1218                 dev->ifalias = NULL;
1219                 return 0;
1220         }
1221
1222         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1223         if (!new_ifalias)
1224                 return -ENOMEM;
1225         dev->ifalias = new_ifalias;
1226
1227         strlcpy(dev->ifalias, alias, len+1);
1228         return len;
1229 }
1230
1231
1232 /**
1233  *      netdev_features_change - device changes features
1234  *      @dev: device to cause notification
1235  *
1236  *      Called to indicate a device has changed features.
1237  */
1238 void netdev_features_change(struct net_device *dev)
1239 {
1240         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1241 }
1242 EXPORT_SYMBOL(netdev_features_change);
1243
1244 /**
1245  *      netdev_state_change - device changes state
1246  *      @dev: device to cause notification
1247  *
1248  *      Called to indicate a device has changed state. This function calls
1249  *      the notifier chains for netdev_chain and sends a NEWLINK message
1250  *      to the routing socket.
1251  */
1252 void netdev_state_change(struct net_device *dev)
1253 {
1254         if (dev->flags & IFF_UP) {
1255                 struct netdev_notifier_change_info change_info;
1256
1257                 change_info.flags_changed = 0;
1258                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1259                                               &change_info.info);
1260                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1261         }
1262 }
1263 EXPORT_SYMBOL(netdev_state_change);
1264
1265 /**
1266  *      netdev_notify_peers - notify network peers about existence of @dev
1267  *      @dev: network device
1268  *
1269  * Generate traffic such that interested network peers are aware of
1270  * @dev, such as by generating a gratuitous ARP. This may be used when
1271  * a device wants to inform the rest of the network about some sort of
1272  * reconfiguration such as a failover event or virtual machine
1273  * migration.
1274  */
1275 void netdev_notify_peers(struct net_device *dev)
1276 {
1277         rtnl_lock();
1278         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1279         rtnl_unlock();
1280 }
1281 EXPORT_SYMBOL(netdev_notify_peers);
1282
1283 static int __dev_open(struct net_device *dev)
1284 {
1285         const struct net_device_ops *ops = dev->netdev_ops;
1286         int ret;
1287
1288         ASSERT_RTNL();
1289
1290         if (!netif_device_present(dev))
1291                 return -ENODEV;
1292
1293         /* Block netpoll from trying to do any rx path servicing.
1294          * If we don't do this there is a chance ndo_poll_controller
1295          * or ndo_poll may be running while we open the device
1296          */
1297         netpoll_poll_disable(dev);
1298
1299         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1300         ret = notifier_to_errno(ret);
1301         if (ret)
1302                 return ret;
1303
1304         set_bit(__LINK_STATE_START, &dev->state);
1305
1306         if (ops->ndo_validate_addr)
1307                 ret = ops->ndo_validate_addr(dev);
1308
1309         if (!ret && ops->ndo_open)
1310                 ret = ops->ndo_open(dev);
1311
1312         netpoll_poll_enable(dev);
1313
1314         if (ret)
1315                 clear_bit(__LINK_STATE_START, &dev->state);
1316         else {
1317                 dev->flags |= IFF_UP;
1318                 dev_set_rx_mode(dev);
1319                 dev_activate(dev);
1320                 add_device_randomness(dev->dev_addr, dev->addr_len);
1321         }
1322
1323         return ret;
1324 }
1325
1326 /**
1327  *      dev_open        - prepare an interface for use.
1328  *      @dev:   device to open
1329  *
1330  *      Takes a device from down to up state. The device's private open
1331  *      function is invoked and then the multicast lists are loaded. Finally
1332  *      the device is moved into the up state and a %NETDEV_UP message is
1333  *      sent to the netdev notifier chain.
1334  *
1335  *      Calling this function on an active interface is a nop. On a failure
1336  *      a negative errno code is returned.
1337  */
1338 int dev_open(struct net_device *dev)
1339 {
1340         int ret;
1341
1342         if (dev->flags & IFF_UP)
1343                 return 0;
1344
1345         ret = __dev_open(dev);
1346         if (ret < 0)
1347                 return ret;
1348
1349         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1350         call_netdevice_notifiers(NETDEV_UP, dev);
1351
1352         return ret;
1353 }
1354 EXPORT_SYMBOL(dev_open);
1355
1356 static int __dev_close_many(struct list_head *head)
1357 {
1358         struct net_device *dev;
1359
1360         ASSERT_RTNL();
1361         might_sleep();
1362
1363         list_for_each_entry(dev, head, close_list) {
1364                 /* Temporarily disable netpoll until the interface is down */
1365                 netpoll_poll_disable(dev);
1366
1367                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1368
1369                 clear_bit(__LINK_STATE_START, &dev->state);
1370
1371                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1372                  * can be even on different cpu. So just clear netif_running().
1373                  *
1374                  * dev->stop() will invoke napi_disable() on all of it's
1375                  * napi_struct instances on this device.
1376                  */
1377                 smp_mb__after_atomic(); /* Commit netif_running(). */
1378         }
1379
1380         dev_deactivate_many(head);
1381
1382         list_for_each_entry(dev, head, close_list) {
1383                 const struct net_device_ops *ops = dev->netdev_ops;
1384
1385                 /*
1386                  *      Call the device specific close. This cannot fail.
1387                  *      Only if device is UP
1388                  *
1389                  *      We allow it to be called even after a DETACH hot-plug
1390                  *      event.
1391                  */
1392                 if (ops->ndo_stop)
1393                         ops->ndo_stop(dev);
1394
1395                 dev->flags &= ~IFF_UP;
1396                 netpoll_poll_enable(dev);
1397         }
1398
1399         return 0;
1400 }
1401
1402 static int __dev_close(struct net_device *dev)
1403 {
1404         int retval;
1405         LIST_HEAD(single);
1406
1407         list_add(&dev->close_list, &single);
1408         retval = __dev_close_many(&single);
1409         list_del(&single);
1410
1411         return retval;
1412 }
1413
1414 int dev_close_many(struct list_head *head, bool unlink)
1415 {
1416         struct net_device *dev, *tmp;
1417
1418         /* Remove the devices that don't need to be closed */
1419         list_for_each_entry_safe(dev, tmp, head, close_list)
1420                 if (!(dev->flags & IFF_UP))
1421                         list_del_init(&dev->close_list);
1422
1423         __dev_close_many(head);
1424
1425         list_for_each_entry_safe(dev, tmp, head, close_list) {
1426                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1427                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1428                 if (unlink)
1429                         list_del_init(&dev->close_list);
1430         }
1431
1432         return 0;
1433 }
1434 EXPORT_SYMBOL(dev_close_many);
1435
1436 /**
1437  *      dev_close - shutdown an interface.
1438  *      @dev: device to shutdown
1439  *
1440  *      This function moves an active device into down state. A
1441  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1442  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1443  *      chain.
1444  */
1445 int dev_close(struct net_device *dev)
1446 {
1447         if (dev->flags & IFF_UP) {
1448                 LIST_HEAD(single);
1449
1450                 list_add(&dev->close_list, &single);
1451                 dev_close_many(&single, true);
1452                 list_del(&single);
1453         }
1454         return 0;
1455 }
1456 EXPORT_SYMBOL(dev_close);
1457
1458
1459 /**
1460  *      dev_disable_lro - disable Large Receive Offload on a device
1461  *      @dev: device
1462  *
1463  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1464  *      called under RTNL.  This is needed if received packets may be
1465  *      forwarded to another interface.
1466  */
1467 void dev_disable_lro(struct net_device *dev)
1468 {
1469         struct net_device *lower_dev;
1470         struct list_head *iter;
1471
1472         dev->wanted_features &= ~NETIF_F_LRO;
1473         netdev_update_features(dev);
1474
1475         if (unlikely(dev->features & NETIF_F_LRO))
1476                 netdev_WARN(dev, "failed to disable LRO!\n");
1477
1478         netdev_for_each_lower_dev(dev, lower_dev, iter)
1479                 dev_disable_lro(lower_dev);
1480 }
1481 EXPORT_SYMBOL(dev_disable_lro);
1482
1483 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1484                                    struct net_device *dev)
1485 {
1486         struct netdev_notifier_info info;
1487
1488         netdev_notifier_info_init(&info, dev);
1489         return nb->notifier_call(nb, val, &info);
1490 }
1491
1492 static int dev_boot_phase = 1;
1493
1494 /**
1495  *      register_netdevice_notifier - register a network notifier block
1496  *      @nb: notifier
1497  *
1498  *      Register a notifier to be called when network device events occur.
1499  *      The notifier passed is linked into the kernel structures and must
1500  *      not be reused until it has been unregistered. A negative errno code
1501  *      is returned on a failure.
1502  *
1503  *      When registered all registration and up events are replayed
1504  *      to the new notifier to allow device to have a race free
1505  *      view of the network device list.
1506  */
1507
1508 int register_netdevice_notifier(struct notifier_block *nb)
1509 {
1510         struct net_device *dev;
1511         struct net_device *last;
1512         struct net *net;
1513         int err;
1514
1515         rtnl_lock();
1516         err = raw_notifier_chain_register(&netdev_chain, nb);
1517         if (err)
1518                 goto unlock;
1519         if (dev_boot_phase)
1520                 goto unlock;
1521         for_each_net(net) {
1522                 for_each_netdev(net, dev) {
1523                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1524                         err = notifier_to_errno(err);
1525                         if (err)
1526                                 goto rollback;
1527
1528                         if (!(dev->flags & IFF_UP))
1529                                 continue;
1530
1531                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1532                 }
1533         }
1534
1535 unlock:
1536         rtnl_unlock();
1537         return err;
1538
1539 rollback:
1540         last = dev;
1541         for_each_net(net) {
1542                 for_each_netdev(net, dev) {
1543                         if (dev == last)
1544                                 goto outroll;
1545
1546                         if (dev->flags & IFF_UP) {
1547                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1548                                                         dev);
1549                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1550                         }
1551                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1552                 }
1553         }
1554
1555 outroll:
1556         raw_notifier_chain_unregister(&netdev_chain, nb);
1557         goto unlock;
1558 }
1559 EXPORT_SYMBOL(register_netdevice_notifier);
1560
1561 /**
1562  *      unregister_netdevice_notifier - unregister a network notifier block
1563  *      @nb: notifier
1564  *
1565  *      Unregister a notifier previously registered by
1566  *      register_netdevice_notifier(). The notifier is unlinked into the
1567  *      kernel structures and may then be reused. A negative errno code
1568  *      is returned on a failure.
1569  *
1570  *      After unregistering unregister and down device events are synthesized
1571  *      for all devices on the device list to the removed notifier to remove
1572  *      the need for special case cleanup code.
1573  */
1574
1575 int unregister_netdevice_notifier(struct notifier_block *nb)
1576 {
1577         struct net_device *dev;
1578         struct net *net;
1579         int err;
1580
1581         rtnl_lock();
1582         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1583         if (err)
1584                 goto unlock;
1585
1586         for_each_net(net) {
1587                 for_each_netdev(net, dev) {
1588                         if (dev->flags & IFF_UP) {
1589                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1590                                                         dev);
1591                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1592                         }
1593                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1594                 }
1595         }
1596 unlock:
1597         rtnl_unlock();
1598         return err;
1599 }
1600 EXPORT_SYMBOL(unregister_netdevice_notifier);
1601
1602 /**
1603  *      call_netdevice_notifiers_info - call all network notifier blocks
1604  *      @val: value passed unmodified to notifier function
1605  *      @dev: net_device pointer passed unmodified to notifier function
1606  *      @info: notifier information data
1607  *
1608  *      Call all network notifier blocks.  Parameters and return value
1609  *      are as for raw_notifier_call_chain().
1610  */
1611
1612 static int call_netdevice_notifiers_info(unsigned long val,
1613                                          struct net_device *dev,
1614                                          struct netdev_notifier_info *info)
1615 {
1616         ASSERT_RTNL();
1617         netdev_notifier_info_init(info, dev);
1618         return raw_notifier_call_chain(&netdev_chain, val, info);
1619 }
1620
1621 /**
1622  *      call_netdevice_notifiers - call all network notifier blocks
1623  *      @val: value passed unmodified to notifier function
1624  *      @dev: net_device pointer passed unmodified to notifier function
1625  *
1626  *      Call all network notifier blocks.  Parameters and return value
1627  *      are as for raw_notifier_call_chain().
1628  */
1629
1630 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1631 {
1632         struct netdev_notifier_info info;
1633
1634         return call_netdevice_notifiers_info(val, dev, &info);
1635 }
1636 EXPORT_SYMBOL(call_netdevice_notifiers);
1637
1638 #ifdef CONFIG_NET_INGRESS
1639 static struct static_key ingress_needed __read_mostly;
1640
1641 void net_inc_ingress_queue(void)
1642 {
1643         static_key_slow_inc(&ingress_needed);
1644 }
1645 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1646
1647 void net_dec_ingress_queue(void)
1648 {
1649         static_key_slow_dec(&ingress_needed);
1650 }
1651 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1652 #endif
1653
1654 static struct static_key netstamp_needed __read_mostly;
1655 #ifdef HAVE_JUMP_LABEL
1656 /* We are not allowed to call static_key_slow_dec() from irq context
1657  * If net_disable_timestamp() is called from irq context, defer the
1658  * static_key_slow_dec() calls.
1659  */
1660 static atomic_t netstamp_needed_deferred;
1661 #endif
1662
1663 void net_enable_timestamp(void)
1664 {
1665 #ifdef HAVE_JUMP_LABEL
1666         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1667
1668         if (deferred) {
1669                 while (--deferred)
1670                         static_key_slow_dec(&netstamp_needed);
1671                 return;
1672         }
1673 #endif
1674         static_key_slow_inc(&netstamp_needed);
1675 }
1676 EXPORT_SYMBOL(net_enable_timestamp);
1677
1678 void net_disable_timestamp(void)
1679 {
1680 #ifdef HAVE_JUMP_LABEL
1681         if (in_interrupt()) {
1682                 atomic_inc(&netstamp_needed_deferred);
1683                 return;
1684         }
1685 #endif
1686         static_key_slow_dec(&netstamp_needed);
1687 }
1688 EXPORT_SYMBOL(net_disable_timestamp);
1689
1690 static inline void net_timestamp_set(struct sk_buff *skb)
1691 {
1692         skb->tstamp.tv64 = 0;
1693         if (static_key_false(&netstamp_needed))
1694                 __net_timestamp(skb);
1695 }
1696
1697 #define net_timestamp_check(COND, SKB)                  \
1698         if (static_key_false(&netstamp_needed)) {               \
1699                 if ((COND) && !(SKB)->tstamp.tv64)      \
1700                         __net_timestamp(SKB);           \
1701         }                                               \
1702
1703 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1704 {
1705         unsigned int len;
1706
1707         if (!(dev->flags & IFF_UP))
1708                 return false;
1709
1710         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1711         if (skb->len <= len)
1712                 return true;
1713
1714         /* if TSO is enabled, we don't care about the length as the packet
1715          * could be forwarded without being segmented before
1716          */
1717         if (skb_is_gso(skb))
1718                 return true;
1719
1720         return false;
1721 }
1722 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1723
1724 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1725 {
1726         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1727                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1728                         atomic_long_inc(&dev->rx_dropped);
1729                         kfree_skb(skb);
1730                         return NET_RX_DROP;
1731                 }
1732         }
1733
1734         if (unlikely(!is_skb_forwardable(dev, skb))) {
1735                 atomic_long_inc(&dev->rx_dropped);
1736                 kfree_skb(skb);
1737                 return NET_RX_DROP;
1738         }
1739
1740         skb_scrub_packet(skb, true);
1741         skb->priority = 0;
1742         skb->protocol = eth_type_trans(skb, dev);
1743         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1744
1745         return 0;
1746 }
1747 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1748
1749 /**
1750  * dev_forward_skb - loopback an skb to another netif
1751  *
1752  * @dev: destination network device
1753  * @skb: buffer to forward
1754  *
1755  * return values:
1756  *      NET_RX_SUCCESS  (no congestion)
1757  *      NET_RX_DROP     (packet was dropped, but freed)
1758  *
1759  * dev_forward_skb can be used for injecting an skb from the
1760  * start_xmit function of one device into the receive queue
1761  * of another device.
1762  *
1763  * The receiving device may be in another namespace, so
1764  * we have to clear all information in the skb that could
1765  * impact namespace isolation.
1766  */
1767 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1768 {
1769         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1770 }
1771 EXPORT_SYMBOL_GPL(dev_forward_skb);
1772
1773 static inline int deliver_skb(struct sk_buff *skb,
1774                               struct packet_type *pt_prev,
1775                               struct net_device *orig_dev)
1776 {
1777         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1778                 return -ENOMEM;
1779         atomic_inc(&skb->users);
1780         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1781 }
1782
1783 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1784                                           struct packet_type **pt,
1785                                           struct net_device *orig_dev,
1786                                           __be16 type,
1787                                           struct list_head *ptype_list)
1788 {
1789         struct packet_type *ptype, *pt_prev = *pt;
1790
1791         list_for_each_entry_rcu(ptype, ptype_list, list) {
1792                 if (ptype->type != type)
1793                         continue;
1794                 if (pt_prev)
1795                         deliver_skb(skb, pt_prev, orig_dev);
1796                 pt_prev = ptype;
1797         }
1798         *pt = pt_prev;
1799 }
1800
1801 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1802 {
1803         if (!ptype->af_packet_priv || !skb->sk)
1804                 return false;
1805
1806         if (ptype->id_match)
1807                 return ptype->id_match(ptype, skb->sk);
1808         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1809                 return true;
1810
1811         return false;
1812 }
1813
1814 /*
1815  *      Support routine. Sends outgoing frames to any network
1816  *      taps currently in use.
1817  */
1818
1819 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1820 {
1821         struct packet_type *ptype;
1822         struct sk_buff *skb2 = NULL;
1823         struct packet_type *pt_prev = NULL;
1824         struct list_head *ptype_list = &ptype_all;
1825
1826         rcu_read_lock();
1827 again:
1828         list_for_each_entry_rcu(ptype, ptype_list, list) {
1829                 /* Never send packets back to the socket
1830                  * they originated from - MvS (miquels@drinkel.ow.org)
1831                  */
1832                 if (skb_loop_sk(ptype, skb))
1833                         continue;
1834
1835                 if (pt_prev) {
1836                         deliver_skb(skb2, pt_prev, skb->dev);
1837                         pt_prev = ptype;
1838                         continue;
1839                 }
1840
1841                 /* need to clone skb, done only once */
1842                 skb2 = skb_clone(skb, GFP_ATOMIC);
1843                 if (!skb2)
1844                         goto out_unlock;
1845
1846                 net_timestamp_set(skb2);
1847
1848                 /* skb->nh should be correctly
1849                  * set by sender, so that the second statement is
1850                  * just protection against buggy protocols.
1851                  */
1852                 skb_reset_mac_header(skb2);
1853
1854                 if (skb_network_header(skb2) < skb2->data ||
1855                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1856                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1857                                              ntohs(skb2->protocol),
1858                                              dev->name);
1859                         skb_reset_network_header(skb2);
1860                 }
1861
1862                 skb2->transport_header = skb2->network_header;
1863                 skb2->pkt_type = PACKET_OUTGOING;
1864                 pt_prev = ptype;
1865         }
1866
1867         if (ptype_list == &ptype_all) {
1868                 ptype_list = &dev->ptype_all;
1869                 goto again;
1870         }
1871 out_unlock:
1872         if (pt_prev)
1873                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1874         rcu_read_unlock();
1875 }
1876
1877 /**
1878  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1879  * @dev: Network device
1880  * @txq: number of queues available
1881  *
1882  * If real_num_tx_queues is changed the tc mappings may no longer be
1883  * valid. To resolve this verify the tc mapping remains valid and if
1884  * not NULL the mapping. With no priorities mapping to this
1885  * offset/count pair it will no longer be used. In the worst case TC0
1886  * is invalid nothing can be done so disable priority mappings. If is
1887  * expected that drivers will fix this mapping if they can before
1888  * calling netif_set_real_num_tx_queues.
1889  */
1890 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1891 {
1892         int i;
1893         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1894
1895         /* If TC0 is invalidated disable TC mapping */
1896         if (tc->offset + tc->count > txq) {
1897                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1898                 dev->num_tc = 0;
1899                 return;
1900         }
1901
1902         /* Invalidated prio to tc mappings set to TC0 */
1903         for (i = 1; i < TC_BITMASK + 1; i++) {
1904                 int q = netdev_get_prio_tc_map(dev, i);
1905
1906                 tc = &dev->tc_to_txq[q];
1907                 if (tc->offset + tc->count > txq) {
1908                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1909                                 i, q);
1910                         netdev_set_prio_tc_map(dev, i, 0);
1911                 }
1912         }
1913 }
1914
1915 #ifdef CONFIG_XPS
1916 static DEFINE_MUTEX(xps_map_mutex);
1917 #define xmap_dereference(P)             \
1918         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1919
1920 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1921                                         int cpu, u16 index)
1922 {
1923         struct xps_map *map = NULL;
1924         int pos;
1925
1926         if (dev_maps)
1927                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1928
1929         for (pos = 0; map && pos < map->len; pos++) {
1930                 if (map->queues[pos] == index) {
1931                         if (map->len > 1) {
1932                                 map->queues[pos] = map->queues[--map->len];
1933                         } else {
1934                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1935                                 kfree_rcu(map, rcu);
1936                                 map = NULL;
1937                         }
1938                         break;
1939                 }
1940         }
1941
1942         return map;
1943 }
1944
1945 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1946 {
1947         struct xps_dev_maps *dev_maps;
1948         int cpu, i;
1949         bool active = false;
1950
1951         mutex_lock(&xps_map_mutex);
1952         dev_maps = xmap_dereference(dev->xps_maps);
1953
1954         if (!dev_maps)
1955                 goto out_no_maps;
1956
1957         for_each_possible_cpu(cpu) {
1958                 for (i = index; i < dev->num_tx_queues; i++) {
1959                         if (!remove_xps_queue(dev_maps, cpu, i))
1960                                 break;
1961                 }
1962                 if (i == dev->num_tx_queues)
1963                         active = true;
1964         }
1965
1966         if (!active) {
1967                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1968                 kfree_rcu(dev_maps, rcu);
1969         }
1970
1971         for (i = index; i < dev->num_tx_queues; i++)
1972                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1973                                              NUMA_NO_NODE);
1974
1975 out_no_maps:
1976         mutex_unlock(&xps_map_mutex);
1977 }
1978
1979 static struct xps_map *expand_xps_map(struct xps_map *map,
1980                                       int cpu, u16 index)
1981 {
1982         struct xps_map *new_map;
1983         int alloc_len = XPS_MIN_MAP_ALLOC;
1984         int i, pos;
1985
1986         for (pos = 0; map && pos < map->len; pos++) {
1987                 if (map->queues[pos] != index)
1988                         continue;
1989                 return map;
1990         }
1991
1992         /* Need to add queue to this CPU's existing map */
1993         if (map) {
1994                 if (pos < map->alloc_len)
1995                         return map;
1996
1997                 alloc_len = map->alloc_len * 2;
1998         }
1999
2000         /* Need to allocate new map to store queue on this CPU's map */
2001         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2002                                cpu_to_node(cpu));
2003         if (!new_map)
2004                 return NULL;
2005
2006         for (i = 0; i < pos; i++)
2007                 new_map->queues[i] = map->queues[i];
2008         new_map->alloc_len = alloc_len;
2009         new_map->len = pos;
2010
2011         return new_map;
2012 }
2013
2014 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2015                         u16 index)
2016 {
2017         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2018         struct xps_map *map, *new_map;
2019         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2020         int cpu, numa_node_id = -2;
2021         bool active = false;
2022
2023         mutex_lock(&xps_map_mutex);
2024
2025         dev_maps = xmap_dereference(dev->xps_maps);
2026
2027         /* allocate memory for queue storage */
2028         for_each_online_cpu(cpu) {
2029                 if (!cpumask_test_cpu(cpu, mask))
2030                         continue;
2031
2032                 if (!new_dev_maps)
2033                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2034                 if (!new_dev_maps) {
2035                         mutex_unlock(&xps_map_mutex);
2036                         return -ENOMEM;
2037                 }
2038
2039                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2040                                  NULL;
2041
2042                 map = expand_xps_map(map, cpu, index);
2043                 if (!map)
2044                         goto error;
2045
2046                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2047         }
2048
2049         if (!new_dev_maps)
2050                 goto out_no_new_maps;
2051
2052         for_each_possible_cpu(cpu) {
2053                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2054                         /* add queue to CPU maps */
2055                         int pos = 0;
2056
2057                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2058                         while ((pos < map->len) && (map->queues[pos] != index))
2059                                 pos++;
2060
2061                         if (pos == map->len)
2062                                 map->queues[map->len++] = index;
2063 #ifdef CONFIG_NUMA
2064                         if (numa_node_id == -2)
2065                                 numa_node_id = cpu_to_node(cpu);
2066                         else if (numa_node_id != cpu_to_node(cpu))
2067                                 numa_node_id = -1;
2068 #endif
2069                 } else if (dev_maps) {
2070                         /* fill in the new device map from the old device map */
2071                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2072                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2073                 }
2074
2075         }
2076
2077         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2078
2079         /* Cleanup old maps */
2080         if (dev_maps) {
2081                 for_each_possible_cpu(cpu) {
2082                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2083                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2084                         if (map && map != new_map)
2085                                 kfree_rcu(map, rcu);
2086                 }
2087
2088                 kfree_rcu(dev_maps, rcu);
2089         }
2090
2091         dev_maps = new_dev_maps;
2092         active = true;
2093
2094 out_no_new_maps:
2095         /* update Tx queue numa node */
2096         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2097                                      (numa_node_id >= 0) ? numa_node_id :
2098                                      NUMA_NO_NODE);
2099
2100         if (!dev_maps)
2101                 goto out_no_maps;
2102
2103         /* removes queue from unused CPUs */
2104         for_each_possible_cpu(cpu) {
2105                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2106                         continue;
2107
2108                 if (remove_xps_queue(dev_maps, cpu, index))
2109                         active = true;
2110         }
2111
2112         /* free map if not active */
2113         if (!active) {
2114                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2115                 kfree_rcu(dev_maps, rcu);
2116         }
2117
2118 out_no_maps:
2119         mutex_unlock(&xps_map_mutex);
2120
2121         return 0;
2122 error:
2123         /* remove any maps that we added */
2124         for_each_possible_cpu(cpu) {
2125                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2126                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2127                                  NULL;
2128                 if (new_map && new_map != map)
2129                         kfree(new_map);
2130         }
2131
2132         mutex_unlock(&xps_map_mutex);
2133
2134         kfree(new_dev_maps);
2135         return -ENOMEM;
2136 }
2137 EXPORT_SYMBOL(netif_set_xps_queue);
2138
2139 #endif
2140 /*
2141  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2142  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2143  */
2144 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2145 {
2146         int rc;
2147
2148         if (txq < 1 || txq > dev->num_tx_queues)
2149                 return -EINVAL;
2150
2151         if (dev->reg_state == NETREG_REGISTERED ||
2152             dev->reg_state == NETREG_UNREGISTERING) {
2153                 ASSERT_RTNL();
2154
2155                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2156                                                   txq);
2157                 if (rc)
2158                         return rc;
2159
2160                 if (dev->num_tc)
2161                         netif_setup_tc(dev, txq);
2162
2163                 if (txq < dev->real_num_tx_queues) {
2164                         qdisc_reset_all_tx_gt(dev, txq);
2165 #ifdef CONFIG_XPS
2166                         netif_reset_xps_queues_gt(dev, txq);
2167 #endif
2168                 }
2169         }
2170
2171         dev->real_num_tx_queues = txq;
2172         return 0;
2173 }
2174 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2175
2176 #ifdef CONFIG_SYSFS
2177 /**
2178  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2179  *      @dev: Network device
2180  *      @rxq: Actual number of RX queues
2181  *
2182  *      This must be called either with the rtnl_lock held or before
2183  *      registration of the net device.  Returns 0 on success, or a
2184  *      negative error code.  If called before registration, it always
2185  *      succeeds.
2186  */
2187 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2188 {
2189         int rc;
2190
2191         if (rxq < 1 || rxq > dev->num_rx_queues)
2192                 return -EINVAL;
2193
2194         if (dev->reg_state == NETREG_REGISTERED) {
2195                 ASSERT_RTNL();
2196
2197                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2198                                                   rxq);
2199                 if (rc)
2200                         return rc;
2201         }
2202
2203         dev->real_num_rx_queues = rxq;
2204         return 0;
2205 }
2206 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2207 #endif
2208
2209 /**
2210  * netif_get_num_default_rss_queues - default number of RSS queues
2211  *
2212  * This routine should set an upper limit on the number of RSS queues
2213  * used by default by multiqueue devices.
2214  */
2215 int netif_get_num_default_rss_queues(void)
2216 {
2217         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2218 }
2219 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2220
2221 static inline void __netif_reschedule(struct Qdisc *q)
2222 {
2223         struct softnet_data *sd;
2224         unsigned long flags;
2225
2226         local_irq_save(flags);
2227         sd = this_cpu_ptr(&softnet_data);
2228         q->next_sched = NULL;
2229         *sd->output_queue_tailp = q;
2230         sd->output_queue_tailp = &q->next_sched;
2231         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2232         local_irq_restore(flags);
2233 }
2234
2235 void __netif_schedule(struct Qdisc *q)
2236 {
2237         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2238                 __netif_reschedule(q);
2239 }
2240 EXPORT_SYMBOL(__netif_schedule);
2241
2242 struct dev_kfree_skb_cb {
2243         enum skb_free_reason reason;
2244 };
2245
2246 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2247 {
2248         return (struct dev_kfree_skb_cb *)skb->cb;
2249 }
2250
2251 void netif_schedule_queue(struct netdev_queue *txq)
2252 {
2253         rcu_read_lock();
2254         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2255                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2256
2257                 __netif_schedule(q);
2258         }
2259         rcu_read_unlock();
2260 }
2261 EXPORT_SYMBOL(netif_schedule_queue);
2262
2263 /**
2264  *      netif_wake_subqueue - allow sending packets on subqueue
2265  *      @dev: network device
2266  *      @queue_index: sub queue index
2267  *
2268  * Resume individual transmit queue of a device with multiple transmit queues.
2269  */
2270 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2271 {
2272         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2273
2274         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2275                 struct Qdisc *q;
2276
2277                 rcu_read_lock();
2278                 q = rcu_dereference(txq->qdisc);
2279                 __netif_schedule(q);
2280                 rcu_read_unlock();
2281         }
2282 }
2283 EXPORT_SYMBOL(netif_wake_subqueue);
2284
2285 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2286 {
2287         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2288                 struct Qdisc *q;
2289
2290                 rcu_read_lock();
2291                 q = rcu_dereference(dev_queue->qdisc);
2292                 __netif_schedule(q);
2293                 rcu_read_unlock();
2294         }
2295 }
2296 EXPORT_SYMBOL(netif_tx_wake_queue);
2297
2298 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2299 {
2300         unsigned long flags;
2301
2302         if (likely(atomic_read(&skb->users) == 1)) {
2303                 smp_rmb();
2304                 atomic_set(&skb->users, 0);
2305         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2306                 return;
2307         }
2308         get_kfree_skb_cb(skb)->reason = reason;
2309         local_irq_save(flags);
2310         skb->next = __this_cpu_read(softnet_data.completion_queue);
2311         __this_cpu_write(softnet_data.completion_queue, skb);
2312         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2313         local_irq_restore(flags);
2314 }
2315 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2316
2317 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2318 {
2319         if (in_irq() || irqs_disabled())
2320                 __dev_kfree_skb_irq(skb, reason);
2321         else
2322                 dev_kfree_skb(skb);
2323 }
2324 EXPORT_SYMBOL(__dev_kfree_skb_any);
2325
2326
2327 /**
2328  * netif_device_detach - mark device as removed
2329  * @dev: network device
2330  *
2331  * Mark device as removed from system and therefore no longer available.
2332  */
2333 void netif_device_detach(struct net_device *dev)
2334 {
2335         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2336             netif_running(dev)) {
2337                 netif_tx_stop_all_queues(dev);
2338         }
2339 }
2340 EXPORT_SYMBOL(netif_device_detach);
2341
2342 /**
2343  * netif_device_attach - mark device as attached
2344  * @dev: network device
2345  *
2346  * Mark device as attached from system and restart if needed.
2347  */
2348 void netif_device_attach(struct net_device *dev)
2349 {
2350         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2351             netif_running(dev)) {
2352                 netif_tx_wake_all_queues(dev);
2353                 __netdev_watchdog_up(dev);
2354         }
2355 }
2356 EXPORT_SYMBOL(netif_device_attach);
2357
2358 /*
2359  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2360  * to be used as a distribution range.
2361  */
2362 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2363                   unsigned int num_tx_queues)
2364 {
2365         u32 hash;
2366         u16 qoffset = 0;
2367         u16 qcount = num_tx_queues;
2368
2369         if (skb_rx_queue_recorded(skb)) {
2370                 hash = skb_get_rx_queue(skb);
2371                 while (unlikely(hash >= num_tx_queues))
2372                         hash -= num_tx_queues;
2373                 return hash;
2374         }
2375
2376         if (dev->num_tc) {
2377                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2378                 qoffset = dev->tc_to_txq[tc].offset;
2379                 qcount = dev->tc_to_txq[tc].count;
2380         }
2381
2382         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2383 }
2384 EXPORT_SYMBOL(__skb_tx_hash);
2385
2386 static void skb_warn_bad_offload(const struct sk_buff *skb)
2387 {
2388         static const netdev_features_t null_features = 0;
2389         struct net_device *dev = skb->dev;
2390         const char *driver = "";
2391
2392         if (!net_ratelimit())
2393                 return;
2394
2395         if (dev && dev->dev.parent)
2396                 driver = dev_driver_string(dev->dev.parent);
2397
2398         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2399              "gso_type=%d ip_summed=%d\n",
2400              driver, dev ? &dev->features : &null_features,
2401              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2402              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2403              skb_shinfo(skb)->gso_type, skb->ip_summed);
2404 }
2405
2406 /*
2407  * Invalidate hardware checksum when packet is to be mangled, and
2408  * complete checksum manually on outgoing path.
2409  */
2410 int skb_checksum_help(struct sk_buff *skb)
2411 {
2412         __wsum csum;
2413         int ret = 0, offset;
2414
2415         if (skb->ip_summed == CHECKSUM_COMPLETE)
2416                 goto out_set_summed;
2417
2418         if (unlikely(skb_shinfo(skb)->gso_size)) {
2419                 skb_warn_bad_offload(skb);
2420                 return -EINVAL;
2421         }
2422
2423         /* Before computing a checksum, we should make sure no frag could
2424          * be modified by an external entity : checksum could be wrong.
2425          */
2426         if (skb_has_shared_frag(skb)) {
2427                 ret = __skb_linearize(skb);
2428                 if (ret)
2429                         goto out;
2430         }
2431
2432         offset = skb_checksum_start_offset(skb);
2433         BUG_ON(offset >= skb_headlen(skb));
2434         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2435
2436         offset += skb->csum_offset;
2437         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2438
2439         if (skb_cloned(skb) &&
2440             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2441                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2442                 if (ret)
2443                         goto out;
2444         }
2445
2446         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2447 out_set_summed:
2448         skb->ip_summed = CHECKSUM_NONE;
2449 out:
2450         return ret;
2451 }
2452 EXPORT_SYMBOL(skb_checksum_help);
2453
2454 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2455 {
2456         __be16 type = skb->protocol;
2457
2458         /* Tunnel gso handlers can set protocol to ethernet. */
2459         if (type == htons(ETH_P_TEB)) {
2460                 struct ethhdr *eth;
2461
2462                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2463                         return 0;
2464
2465                 eth = (struct ethhdr *)skb_mac_header(skb);
2466                 type = eth->h_proto;
2467         }
2468
2469         return __vlan_get_protocol(skb, type, depth);
2470 }
2471
2472 /**
2473  *      skb_mac_gso_segment - mac layer segmentation handler.
2474  *      @skb: buffer to segment
2475  *      @features: features for the output path (see dev->features)
2476  */
2477 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2478                                     netdev_features_t features)
2479 {
2480         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2481         struct packet_offload *ptype;
2482         int vlan_depth = skb->mac_len;
2483         __be16 type = skb_network_protocol(skb, &vlan_depth);
2484
2485         if (unlikely(!type))
2486                 return ERR_PTR(-EINVAL);
2487
2488         __skb_pull(skb, vlan_depth);
2489
2490         rcu_read_lock();
2491         list_for_each_entry_rcu(ptype, &offload_base, list) {
2492                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2493                         segs = ptype->callbacks.gso_segment(skb, features);
2494                         break;
2495                 }
2496         }
2497         rcu_read_unlock();
2498
2499         __skb_push(skb, skb->data - skb_mac_header(skb));
2500
2501         return segs;
2502 }
2503 EXPORT_SYMBOL(skb_mac_gso_segment);
2504
2505
2506 /* openvswitch calls this on rx path, so we need a different check.
2507  */
2508 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2509 {
2510         if (tx_path)
2511                 return skb->ip_summed != CHECKSUM_PARTIAL;
2512         else
2513                 return skb->ip_summed == CHECKSUM_NONE;
2514 }
2515
2516 /**
2517  *      __skb_gso_segment - Perform segmentation on skb.
2518  *      @skb: buffer to segment
2519  *      @features: features for the output path (see dev->features)
2520  *      @tx_path: whether it is called in TX path
2521  *
2522  *      This function segments the given skb and returns a list of segments.
2523  *
2524  *      It may return NULL if the skb requires no segmentation.  This is
2525  *      only possible when GSO is used for verifying header integrity.
2526  */
2527 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2528                                   netdev_features_t features, bool tx_path)
2529 {
2530         if (unlikely(skb_needs_check(skb, tx_path))) {
2531                 int err;
2532
2533                 skb_warn_bad_offload(skb);
2534
2535                 err = skb_cow_head(skb, 0);
2536                 if (err < 0)
2537                         return ERR_PTR(err);
2538         }
2539
2540         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2541         SKB_GSO_CB(skb)->encap_level = 0;
2542
2543         skb_reset_mac_header(skb);
2544         skb_reset_mac_len(skb);
2545
2546         return skb_mac_gso_segment(skb, features);
2547 }
2548 EXPORT_SYMBOL(__skb_gso_segment);
2549
2550 /* Take action when hardware reception checksum errors are detected. */
2551 #ifdef CONFIG_BUG
2552 void netdev_rx_csum_fault(struct net_device *dev)
2553 {
2554         if (net_ratelimit()) {
2555                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2556                 dump_stack();
2557         }
2558 }
2559 EXPORT_SYMBOL(netdev_rx_csum_fault);
2560 #endif
2561
2562 /* Actually, we should eliminate this check as soon as we know, that:
2563  * 1. IOMMU is present and allows to map all the memory.
2564  * 2. No high memory really exists on this machine.
2565  */
2566
2567 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2568 {
2569 #ifdef CONFIG_HIGHMEM
2570         int i;
2571         if (!(dev->features & NETIF_F_HIGHDMA)) {
2572                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2573                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2574                         if (PageHighMem(skb_frag_page(frag)))
2575                                 return 1;
2576                 }
2577         }
2578
2579         if (PCI_DMA_BUS_IS_PHYS) {
2580                 struct device *pdev = dev->dev.parent;
2581
2582                 if (!pdev)
2583                         return 0;
2584                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2585                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2586                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2587                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2588                                 return 1;
2589                 }
2590         }
2591 #endif
2592         return 0;
2593 }
2594
2595 /* If MPLS offload request, verify we are testing hardware MPLS features
2596  * instead of standard features for the netdev.
2597  */
2598 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2599 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2600                                            netdev_features_t features,
2601                                            __be16 type)
2602 {
2603         if (eth_p_mpls(type))
2604                 features &= skb->dev->mpls_features;
2605
2606         return features;
2607 }
2608 #else
2609 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2610                                            netdev_features_t features,
2611                                            __be16 type)
2612 {
2613         return features;
2614 }
2615 #endif
2616
2617 static netdev_features_t harmonize_features(struct sk_buff *skb,
2618         netdev_features_t features)
2619 {
2620         int tmp;
2621         __be16 type;
2622
2623         type = skb_network_protocol(skb, &tmp);
2624         features = net_mpls_features(skb, features, type);
2625
2626         if (skb->ip_summed != CHECKSUM_NONE &&
2627             !can_checksum_protocol(features, type)) {
2628                 features &= ~NETIF_F_ALL_CSUM;
2629         } else if (illegal_highdma(skb->dev, skb)) {
2630                 features &= ~NETIF_F_SG;
2631         }
2632
2633         return features;
2634 }
2635
2636 netdev_features_t passthru_features_check(struct sk_buff *skb,
2637                                           struct net_device *dev,
2638                                           netdev_features_t features)
2639 {
2640         return features;
2641 }
2642 EXPORT_SYMBOL(passthru_features_check);
2643
2644 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2645                                              struct net_device *dev,
2646                                              netdev_features_t features)
2647 {
2648         return vlan_features_check(skb, features);
2649 }
2650
2651 netdev_features_t netif_skb_features(struct sk_buff *skb)
2652 {
2653         struct net_device *dev = skb->dev;
2654         netdev_features_t features = dev->features;
2655         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2656
2657         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2658                 features &= ~NETIF_F_GSO_MASK;
2659
2660         /* If encapsulation offload request, verify we are testing
2661          * hardware encapsulation features instead of standard
2662          * features for the netdev
2663          */
2664         if (skb->encapsulation)
2665                 features &= dev->hw_enc_features;
2666
2667         if (skb_vlan_tagged(skb))
2668                 features = netdev_intersect_features(features,
2669                                                      dev->vlan_features |
2670                                                      NETIF_F_HW_VLAN_CTAG_TX |
2671                                                      NETIF_F_HW_VLAN_STAG_TX);
2672
2673         if (dev->netdev_ops->ndo_features_check)
2674                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2675                                                                 features);
2676         else
2677                 features &= dflt_features_check(skb, dev, features);
2678
2679         return harmonize_features(skb, features);
2680 }
2681 EXPORT_SYMBOL(netif_skb_features);
2682
2683 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2684                     struct netdev_queue *txq, bool more)
2685 {
2686         unsigned int len;
2687         int rc;
2688
2689         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2690                 dev_queue_xmit_nit(skb, dev);
2691
2692         len = skb->len;
2693         trace_net_dev_start_xmit(skb, dev);
2694         rc = netdev_start_xmit(skb, dev, txq, more);
2695         trace_net_dev_xmit(skb, rc, dev, len);
2696
2697         return rc;
2698 }
2699
2700 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2701                                     struct netdev_queue *txq, int *ret)
2702 {
2703         struct sk_buff *skb = first;
2704         int rc = NETDEV_TX_OK;
2705
2706         while (skb) {
2707                 struct sk_buff *next = skb->next;
2708
2709                 skb->next = NULL;
2710                 rc = xmit_one(skb, dev, txq, next != NULL);
2711                 if (unlikely(!dev_xmit_complete(rc))) {
2712                         skb->next = next;
2713                         goto out;
2714                 }
2715
2716                 skb = next;
2717                 if (netif_xmit_stopped(txq) && skb) {
2718                         rc = NETDEV_TX_BUSY;
2719                         break;
2720                 }
2721         }
2722
2723 out:
2724         *ret = rc;
2725         return skb;
2726 }
2727
2728 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2729                                           netdev_features_t features)
2730 {
2731         if (skb_vlan_tag_present(skb) &&
2732             !vlan_hw_offload_capable(features, skb->vlan_proto))
2733                 skb = __vlan_hwaccel_push_inside(skb);
2734         return skb;
2735 }
2736
2737 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2738 {
2739         netdev_features_t features;
2740
2741         if (skb->next)
2742                 return skb;
2743
2744         features = netif_skb_features(skb);
2745         skb = validate_xmit_vlan(skb, features);
2746         if (unlikely(!skb))
2747                 goto out_null;
2748
2749         if (netif_needs_gso(skb, features)) {
2750                 struct sk_buff *segs;
2751
2752                 segs = skb_gso_segment(skb, features);
2753                 if (IS_ERR(segs)) {
2754                         goto out_kfree_skb;
2755                 } else if (segs) {
2756                         consume_skb(skb);
2757                         skb = segs;
2758                 }
2759         } else {
2760                 if (skb_needs_linearize(skb, features) &&
2761                     __skb_linearize(skb))
2762                         goto out_kfree_skb;
2763
2764                 /* If packet is not checksummed and device does not
2765                  * support checksumming for this protocol, complete
2766                  * checksumming here.
2767                  */
2768                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2769                         if (skb->encapsulation)
2770                                 skb_set_inner_transport_header(skb,
2771                                                                skb_checksum_start_offset(skb));
2772                         else
2773                                 skb_set_transport_header(skb,
2774                                                          skb_checksum_start_offset(skb));
2775                         if (!(features & NETIF_F_ALL_CSUM) &&
2776                             skb_checksum_help(skb))
2777                                 goto out_kfree_skb;
2778                 }
2779         }
2780
2781         return skb;
2782
2783 out_kfree_skb:
2784         kfree_skb(skb);
2785 out_null:
2786         return NULL;
2787 }
2788
2789 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2790 {
2791         struct sk_buff *next, *head = NULL, *tail;
2792
2793         for (; skb != NULL; skb = next) {
2794                 next = skb->next;
2795                 skb->next = NULL;
2796
2797                 /* in case skb wont be segmented, point to itself */
2798                 skb->prev = skb;
2799
2800                 skb = validate_xmit_skb(skb, dev);
2801                 if (!skb)
2802                         continue;
2803
2804                 if (!head)
2805                         head = skb;
2806                 else
2807                         tail->next = skb;
2808                 /* If skb was segmented, skb->prev points to
2809                  * the last segment. If not, it still contains skb.
2810                  */
2811                 tail = skb->prev;
2812         }
2813         return head;
2814 }
2815
2816 static void qdisc_pkt_len_init(struct sk_buff *skb)
2817 {
2818         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2819
2820         qdisc_skb_cb(skb)->pkt_len = skb->len;
2821
2822         /* To get more precise estimation of bytes sent on wire,
2823          * we add to pkt_len the headers size of all segments
2824          */
2825         if (shinfo->gso_size)  {
2826                 unsigned int hdr_len;
2827                 u16 gso_segs = shinfo->gso_segs;
2828
2829                 /* mac layer + network layer */
2830                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2831
2832                 /* + transport layer */
2833                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2834                         hdr_len += tcp_hdrlen(skb);
2835                 else
2836                         hdr_len += sizeof(struct udphdr);
2837
2838                 if (shinfo->gso_type & SKB_GSO_DODGY)
2839                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2840                                                 shinfo->gso_size);
2841
2842                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2843         }
2844 }
2845
2846 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2847                                  struct net_device *dev,
2848                                  struct netdev_queue *txq)
2849 {
2850         spinlock_t *root_lock = qdisc_lock(q);
2851         bool contended;
2852         int rc;
2853
2854         qdisc_pkt_len_init(skb);
2855         qdisc_calculate_pkt_len(skb, q);
2856         /*
2857          * Heuristic to force contended enqueues to serialize on a
2858          * separate lock before trying to get qdisc main lock.
2859          * This permits __QDISC___STATE_RUNNING owner to get the lock more
2860          * often and dequeue packets faster.
2861          */
2862         contended = qdisc_is_running(q);
2863         if (unlikely(contended))
2864                 spin_lock(&q->busylock);
2865
2866         spin_lock(root_lock);
2867         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2868                 kfree_skb(skb);
2869                 rc = NET_XMIT_DROP;
2870         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2871                    qdisc_run_begin(q)) {
2872                 /*
2873                  * This is a work-conserving queue; there are no old skbs
2874                  * waiting to be sent out; and the qdisc is not running -
2875                  * xmit the skb directly.
2876                  */
2877
2878                 qdisc_bstats_update(q, skb);
2879
2880                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2881                         if (unlikely(contended)) {
2882                                 spin_unlock(&q->busylock);
2883                                 contended = false;
2884                         }
2885                         __qdisc_run(q);
2886                 } else
2887                         qdisc_run_end(q);
2888
2889                 rc = NET_XMIT_SUCCESS;
2890         } else {
2891                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2892                 if (qdisc_run_begin(q)) {
2893                         if (unlikely(contended)) {
2894                                 spin_unlock(&q->busylock);
2895                                 contended = false;
2896                         }
2897                         __qdisc_run(q);
2898                 }
2899         }
2900         spin_unlock(root_lock);
2901         if (unlikely(contended))
2902                 spin_unlock(&q->busylock);
2903         return rc;
2904 }
2905
2906 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2907 static void skb_update_prio(struct sk_buff *skb)
2908 {
2909         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2910
2911         if (!skb->priority && skb->sk && map) {
2912                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2913
2914                 if (prioidx < map->priomap_len)
2915                         skb->priority = map->priomap[prioidx];
2916         }
2917 }
2918 #else
2919 #define skb_update_prio(skb)
2920 #endif
2921
2922 DEFINE_PER_CPU(int, xmit_recursion);
2923 EXPORT_SYMBOL(xmit_recursion);
2924
2925 #define RECURSION_LIMIT 10
2926
2927 /**
2928  *      dev_loopback_xmit - loop back @skb
2929  *      @skb: buffer to transmit
2930  */
2931 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2932 {
2933         skb_reset_mac_header(skb);
2934         __skb_pull(skb, skb_network_offset(skb));
2935         skb->pkt_type = PACKET_LOOPBACK;
2936         skb->ip_summed = CHECKSUM_UNNECESSARY;
2937         WARN_ON(!skb_dst(skb));
2938         skb_dst_force(skb);
2939         netif_rx_ni(skb);
2940         return 0;
2941 }
2942 EXPORT_SYMBOL(dev_loopback_xmit);
2943
2944 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2945 {
2946 #ifdef CONFIG_XPS
2947         struct xps_dev_maps *dev_maps;
2948         struct xps_map *map;
2949         int queue_index = -1;
2950
2951         rcu_read_lock();
2952         dev_maps = rcu_dereference(dev->xps_maps);
2953         if (dev_maps) {
2954                 map = rcu_dereference(
2955                     dev_maps->cpu_map[skb->sender_cpu - 1]);
2956                 if (map) {
2957                         if (map->len == 1)
2958                                 queue_index = map->queues[0];
2959                         else
2960                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2961                                                                            map->len)];
2962                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2963                                 queue_index = -1;
2964                 }
2965         }
2966         rcu_read_unlock();
2967
2968         return queue_index;
2969 #else
2970         return -1;
2971 #endif
2972 }
2973
2974 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2975 {
2976         struct sock *sk = skb->sk;
2977         int queue_index = sk_tx_queue_get(sk);
2978
2979         if (queue_index < 0 || skb->ooo_okay ||
2980             queue_index >= dev->real_num_tx_queues) {
2981                 int new_index = get_xps_queue(dev, skb);
2982                 if (new_index < 0)
2983                         new_index = skb_tx_hash(dev, skb);
2984
2985                 if (queue_index != new_index && sk &&
2986                     rcu_access_pointer(sk->sk_dst_cache))
2987                         sk_tx_queue_set(sk, new_index);
2988
2989                 queue_index = new_index;
2990         }
2991
2992         return queue_index;
2993 }
2994
2995 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2996                                     struct sk_buff *skb,
2997                                     void *accel_priv)
2998 {
2999         int queue_index = 0;
3000
3001 #ifdef CONFIG_XPS
3002         if (skb->sender_cpu == 0)
3003                 skb->sender_cpu = raw_smp_processor_id() + 1;
3004 #endif
3005
3006         if (dev->real_num_tx_queues != 1) {
3007                 const struct net_device_ops *ops = dev->netdev_ops;
3008                 if (ops->ndo_select_queue)
3009                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3010                                                             __netdev_pick_tx);
3011                 else
3012                         queue_index = __netdev_pick_tx(dev, skb);
3013
3014                 if (!accel_priv)
3015                         queue_index = netdev_cap_txqueue(dev, queue_index);
3016         }
3017
3018         skb_set_queue_mapping(skb, queue_index);
3019         return netdev_get_tx_queue(dev, queue_index);
3020 }
3021
3022 /**
3023  *      __dev_queue_xmit - transmit a buffer
3024  *      @skb: buffer to transmit
3025  *      @accel_priv: private data used for L2 forwarding offload
3026  *
3027  *      Queue a buffer for transmission to a network device. The caller must
3028  *      have set the device and priority and built the buffer before calling
3029  *      this function. The function can be called from an interrupt.
3030  *
3031  *      A negative errno code is returned on a failure. A success does not
3032  *      guarantee the frame will be transmitted as it may be dropped due
3033  *      to congestion or traffic shaping.
3034  *
3035  * -----------------------------------------------------------------------------------
3036  *      I notice this method can also return errors from the queue disciplines,
3037  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3038  *      be positive.
3039  *
3040  *      Regardless of the return value, the skb is consumed, so it is currently
3041  *      difficult to retry a send to this method.  (You can bump the ref count
3042  *      before sending to hold a reference for retry if you are careful.)
3043  *
3044  *      When calling this method, interrupts MUST be enabled.  This is because
3045  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3046  *          --BLG
3047  */
3048 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3049 {
3050         struct net_device *dev = skb->dev;
3051         struct netdev_queue *txq;
3052         struct Qdisc *q;
3053         int rc = -ENOMEM;
3054
3055         skb_reset_mac_header(skb);
3056
3057         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3058                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3059
3060         /* Disable soft irqs for various locks below. Also
3061          * stops preemption for RCU.
3062          */
3063         rcu_read_lock_bh();
3064
3065         skb_update_prio(skb);
3066
3067         /* If device/qdisc don't need skb->dst, release it right now while
3068          * its hot in this cpu cache.
3069          */
3070         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3071                 skb_dst_drop(skb);
3072         else
3073                 skb_dst_force(skb);
3074
3075         txq = netdev_pick_tx(dev, skb, accel_priv);
3076         q = rcu_dereference_bh(txq->qdisc);
3077
3078 #ifdef CONFIG_NET_CLS_ACT
3079         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3080 #endif
3081         trace_net_dev_queue(skb);
3082         if (q->enqueue) {
3083                 rc = __dev_xmit_skb(skb, q, dev, txq);
3084                 goto out;
3085         }
3086
3087         /* The device has no queue. Common case for software devices:
3088            loopback, all the sorts of tunnels...
3089
3090            Really, it is unlikely that netif_tx_lock protection is necessary
3091            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3092            counters.)
3093            However, it is possible, that they rely on protection
3094            made by us here.
3095
3096            Check this and shot the lock. It is not prone from deadlocks.
3097            Either shot noqueue qdisc, it is even simpler 8)
3098          */
3099         if (dev->flags & IFF_UP) {
3100                 int cpu = smp_processor_id(); /* ok because BHs are off */
3101
3102                 if (txq->xmit_lock_owner != cpu) {
3103
3104                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3105                                 goto recursion_alert;
3106
3107                         skb = validate_xmit_skb(skb, dev);
3108                         if (!skb)
3109                                 goto drop;
3110
3111                         HARD_TX_LOCK(dev, txq, cpu);
3112
3113                         if (!netif_xmit_stopped(txq)) {
3114                                 __this_cpu_inc(xmit_recursion);
3115                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3116                                 __this_cpu_dec(xmit_recursion);
3117                                 if (dev_xmit_complete(rc)) {
3118                                         HARD_TX_UNLOCK(dev, txq);
3119                                         goto out;
3120                                 }
3121                         }
3122                         HARD_TX_UNLOCK(dev, txq);
3123                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3124                                              dev->name);
3125                 } else {
3126                         /* Recursion is detected! It is possible,
3127                          * unfortunately
3128                          */
3129 recursion_alert:
3130                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3131                                              dev->name);
3132                 }
3133         }
3134
3135         rc = -ENETDOWN;
3136 drop:
3137         rcu_read_unlock_bh();
3138
3139         atomic_long_inc(&dev->tx_dropped);
3140         kfree_skb_list(skb);
3141         return rc;
3142 out:
3143         rcu_read_unlock_bh();
3144         return rc;
3145 }
3146
3147 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3148 {
3149         return __dev_queue_xmit(skb, NULL);
3150 }
3151 EXPORT_SYMBOL(dev_queue_xmit_sk);
3152
3153 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3154 {
3155         return __dev_queue_xmit(skb, accel_priv);
3156 }
3157 EXPORT_SYMBOL(dev_queue_xmit_accel);
3158
3159
3160 /*=======================================================================
3161                         Receiver routines
3162   =======================================================================*/
3163
3164 int netdev_max_backlog __read_mostly = 1000;
3165 EXPORT_SYMBOL(netdev_max_backlog);
3166
3167 int netdev_tstamp_prequeue __read_mostly = 1;
3168 int netdev_budget __read_mostly = 300;
3169 int weight_p __read_mostly = 64;            /* old backlog weight */
3170
3171 /* Called with irq disabled */
3172 static inline void ____napi_schedule(struct softnet_data *sd,
3173                                      struct napi_struct *napi)
3174 {
3175         list_add_tail(&napi->poll_list, &sd->poll_list);
3176         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3177 }
3178
3179 #ifdef CONFIG_RPS
3180
3181 /* One global table that all flow-based protocols share. */
3182 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3183 EXPORT_SYMBOL(rps_sock_flow_table);
3184 u32 rps_cpu_mask __read_mostly;
3185 EXPORT_SYMBOL(rps_cpu_mask);
3186
3187 struct static_key rps_needed __read_mostly;
3188
3189 static struct rps_dev_flow *
3190 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3191             struct rps_dev_flow *rflow, u16 next_cpu)
3192 {
3193         if (next_cpu < nr_cpu_ids) {
3194 #ifdef CONFIG_RFS_ACCEL
3195                 struct netdev_rx_queue *rxqueue;
3196                 struct rps_dev_flow_table *flow_table;
3197                 struct rps_dev_flow *old_rflow;
3198                 u32 flow_id;
3199                 u16 rxq_index;
3200                 int rc;
3201
3202                 /* Should we steer this flow to a different hardware queue? */
3203                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3204                     !(dev->features & NETIF_F_NTUPLE))
3205                         goto out;
3206                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3207                 if (rxq_index == skb_get_rx_queue(skb))
3208                         goto out;
3209
3210                 rxqueue = dev->_rx + rxq_index;
3211                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3212                 if (!flow_table)
3213                         goto out;
3214                 flow_id = skb_get_hash(skb) & flow_table->mask;
3215                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3216                                                         rxq_index, flow_id);
3217                 if (rc < 0)
3218                         goto out;
3219                 old_rflow = rflow;
3220                 rflow = &flow_table->flows[flow_id];
3221                 rflow->filter = rc;
3222                 if (old_rflow->filter == rflow->filter)
3223                         old_rflow->filter = RPS_NO_FILTER;
3224         out:
3225 #endif
3226                 rflow->last_qtail =
3227                         per_cpu(softnet_data, next_cpu).input_queue_head;
3228         }
3229
3230         rflow->cpu = next_cpu;
3231         return rflow;
3232 }
3233
3234 /*
3235  * get_rps_cpu is called from netif_receive_skb and returns the target
3236  * CPU from the RPS map of the receiving queue for a given skb.
3237  * rcu_read_lock must be held on entry.
3238  */
3239 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3240                        struct rps_dev_flow **rflowp)
3241 {
3242         const struct rps_sock_flow_table *sock_flow_table;
3243         struct netdev_rx_queue *rxqueue = dev->_rx;
3244         struct rps_dev_flow_table *flow_table;
3245         struct rps_map *map;
3246         int cpu = -1;
3247         u32 tcpu;
3248         u32 hash;
3249
3250         if (skb_rx_queue_recorded(skb)) {
3251                 u16 index = skb_get_rx_queue(skb);
3252
3253                 if (unlikely(index >= dev->real_num_rx_queues)) {
3254                         WARN_ONCE(dev->real_num_rx_queues > 1,
3255                                   "%s received packet on queue %u, but number "
3256                                   "of RX queues is %u\n",
3257                                   dev->name, index, dev->real_num_rx_queues);
3258                         goto done;
3259                 }
3260                 rxqueue += index;
3261         }
3262
3263         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3264
3265         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3266         map = rcu_dereference(rxqueue->rps_map);
3267         if (!flow_table && !map)
3268                 goto done;
3269
3270         skb_reset_network_header(skb);
3271         hash = skb_get_hash(skb);
3272         if (!hash)
3273                 goto done;
3274
3275         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3276         if (flow_table && sock_flow_table) {
3277                 struct rps_dev_flow *rflow;
3278                 u32 next_cpu;
3279                 u32 ident;
3280
3281                 /* First check into global flow table if there is a match */
3282                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3283                 if ((ident ^ hash) & ~rps_cpu_mask)
3284                         goto try_rps;
3285
3286                 next_cpu = ident & rps_cpu_mask;
3287
3288                 /* OK, now we know there is a match,
3289                  * we can look at the local (per receive queue) flow table
3290                  */
3291                 rflow = &flow_table->flows[hash & flow_table->mask];
3292                 tcpu = rflow->cpu;
3293
3294                 /*
3295                  * If the desired CPU (where last recvmsg was done) is
3296                  * different from current CPU (one in the rx-queue flow
3297                  * table entry), switch if one of the following holds:
3298                  *   - Current CPU is unset (>= nr_cpu_ids).
3299                  *   - Current CPU is offline.
3300                  *   - The current CPU's queue tail has advanced beyond the
3301                  *     last packet that was enqueued using this table entry.
3302                  *     This guarantees that all previous packets for the flow
3303                  *     have been dequeued, thus preserving in order delivery.
3304                  */
3305                 if (unlikely(tcpu != next_cpu) &&
3306                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3307                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3308                       rflow->last_qtail)) >= 0)) {
3309                         tcpu = next_cpu;
3310                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3311                 }
3312
3313                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3314                         *rflowp = rflow;
3315                         cpu = tcpu;
3316                         goto done;
3317                 }
3318         }
3319
3320 try_rps:
3321
3322         if (map) {
3323                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3324                 if (cpu_online(tcpu)) {
3325                         cpu = tcpu;
3326                         goto done;
3327                 }
3328         }
3329
3330 done:
3331         return cpu;
3332 }
3333
3334 #ifdef CONFIG_RFS_ACCEL
3335
3336 /**
3337  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3338  * @dev: Device on which the filter was set
3339  * @rxq_index: RX queue index
3340  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3341  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3342  *
3343  * Drivers that implement ndo_rx_flow_steer() should periodically call
3344  * this function for each installed filter and remove the filters for
3345  * which it returns %true.
3346  */
3347 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3348                          u32 flow_id, u16 filter_id)
3349 {
3350         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3351         struct rps_dev_flow_table *flow_table;
3352         struct rps_dev_flow *rflow;
3353         bool expire = true;
3354         unsigned int cpu;
3355
3356         rcu_read_lock();
3357         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3358         if (flow_table && flow_id <= flow_table->mask) {
3359                 rflow = &flow_table->flows[flow_id];
3360                 cpu = ACCESS_ONCE(rflow->cpu);
3361                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3362                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3363                            rflow->last_qtail) <
3364                      (int)(10 * flow_table->mask)))
3365                         expire = false;
3366         }
3367         rcu_read_unlock();
3368         return expire;
3369 }
3370 EXPORT_SYMBOL(rps_may_expire_flow);
3371
3372 #endif /* CONFIG_RFS_ACCEL */
3373
3374 /* Called from hardirq (IPI) context */
3375 static void rps_trigger_softirq(void *data)
3376 {
3377         struct softnet_data *sd = data;
3378
3379         ____napi_schedule(sd, &sd->backlog);
3380         sd->received_rps++;
3381 }
3382
3383 #endif /* CONFIG_RPS */
3384
3385 /*
3386  * Check if this softnet_data structure is another cpu one
3387  * If yes, queue it to our IPI list and return 1
3388  * If no, return 0
3389  */
3390 static int rps_ipi_queued(struct softnet_data *sd)
3391 {
3392 #ifdef CONFIG_RPS
3393         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3394
3395         if (sd != mysd) {
3396                 sd->rps_ipi_next = mysd->rps_ipi_list;
3397                 mysd->rps_ipi_list = sd;
3398
3399                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3400                 return 1;
3401         }
3402 #endif /* CONFIG_RPS */
3403         return 0;
3404 }
3405
3406 #ifdef CONFIG_NET_FLOW_LIMIT
3407 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3408 #endif
3409
3410 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3411 {
3412 #ifdef CONFIG_NET_FLOW_LIMIT
3413         struct sd_flow_limit *fl;
3414         struct softnet_data *sd;
3415         unsigned int old_flow, new_flow;
3416
3417         if (qlen < (netdev_max_backlog >> 1))
3418                 return false;
3419
3420         sd = this_cpu_ptr(&softnet_data);
3421
3422         rcu_read_lock();
3423         fl = rcu_dereference(sd->flow_limit);
3424         if (fl) {
3425                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3426                 old_flow = fl->history[fl->history_head];
3427                 fl->history[fl->history_head] = new_flow;
3428
3429                 fl->history_head++;
3430                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3431
3432                 if (likely(fl->buckets[old_flow]))
3433                         fl->buckets[old_flow]--;
3434
3435                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3436                         fl->count++;
3437                         rcu_read_unlock();
3438                         return true;
3439                 }
3440         }
3441         rcu_read_unlock();
3442 #endif
3443         return false;
3444 }
3445
3446 /*
3447  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3448  * queue (may be a remote CPU queue).
3449  */
3450 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3451                               unsigned int *qtail)
3452 {
3453         struct softnet_data *sd;
3454         unsigned long flags;
3455         unsigned int qlen;
3456
3457         sd = &per_cpu(softnet_data, cpu);
3458
3459         local_irq_save(flags);
3460
3461         rps_lock(sd);
3462         qlen = skb_queue_len(&sd->input_pkt_queue);
3463         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3464                 if (qlen) {
3465 enqueue:
3466                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3467                         input_queue_tail_incr_save(sd, qtail);
3468                         rps_unlock(sd);
3469                         local_irq_restore(flags);
3470                         return NET_RX_SUCCESS;
3471                 }
3472
3473                 /* Schedule NAPI for backlog device
3474                  * We can use non atomic operation since we own the queue lock
3475                  */
3476                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3477                         if (!rps_ipi_queued(sd))
3478                                 ____napi_schedule(sd, &sd->backlog);
3479                 }
3480                 goto enqueue;
3481         }
3482
3483         sd->dropped++;
3484         rps_unlock(sd);
3485
3486         local_irq_restore(flags);
3487
3488         atomic_long_inc(&skb->dev->rx_dropped);
3489         kfree_skb(skb);
3490         return NET_RX_DROP;
3491 }
3492
3493 static int netif_rx_internal(struct sk_buff *skb)
3494 {
3495         int ret;
3496
3497         net_timestamp_check(netdev_tstamp_prequeue, skb);
3498
3499         trace_netif_rx(skb);
3500 #ifdef CONFIG_RPS
3501         if (static_key_false(&rps_needed)) {
3502                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3503                 int cpu;
3504
3505                 preempt_disable();
3506                 rcu_read_lock();
3507
3508                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3509                 if (cpu < 0)
3510                         cpu = smp_processor_id();
3511
3512                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3513
3514                 rcu_read_unlock();
3515                 preempt_enable();
3516         } else
3517 #endif
3518         {
3519                 unsigned int qtail;
3520                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3521                 put_cpu();
3522         }
3523         return ret;
3524 }
3525
3526 /**
3527  *      netif_rx        -       post buffer to the network code
3528  *      @skb: buffer to post
3529  *
3530  *      This function receives a packet from a device driver and queues it for
3531  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3532  *      may be dropped during processing for congestion control or by the
3533  *      protocol layers.
3534  *
3535  *      return values:
3536  *      NET_RX_SUCCESS  (no congestion)
3537  *      NET_RX_DROP     (packet was dropped)
3538  *
3539  */
3540
3541 int netif_rx(struct sk_buff *skb)
3542 {
3543         trace_netif_rx_entry(skb);
3544
3545         return netif_rx_internal(skb);
3546 }
3547 EXPORT_SYMBOL(netif_rx);
3548
3549 int netif_rx_ni(struct sk_buff *skb)
3550 {
3551         int err;
3552
3553         trace_netif_rx_ni_entry(skb);
3554
3555         preempt_disable();
3556         err = netif_rx_internal(skb);
3557         if (local_softirq_pending())
3558                 do_softirq();
3559         preempt_enable();
3560
3561         return err;
3562 }
3563 EXPORT_SYMBOL(netif_rx_ni);
3564
3565 static void net_tx_action(struct softirq_action *h)
3566 {
3567         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3568
3569         if (sd->completion_queue) {
3570                 struct sk_buff *clist;
3571
3572                 local_irq_disable();
3573                 clist = sd->completion_queue;
3574                 sd->completion_queue = NULL;
3575                 local_irq_enable();
3576
3577                 while (clist) {
3578                         struct sk_buff *skb = clist;
3579                         clist = clist->next;
3580
3581                         WARN_ON(atomic_read(&skb->users));
3582                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3583                                 trace_consume_skb(skb);
3584                         else
3585                                 trace_kfree_skb(skb, net_tx_action);
3586                         __kfree_skb(skb);
3587                 }
3588         }
3589
3590         if (sd->output_queue) {
3591                 struct Qdisc *head;
3592
3593                 local_irq_disable();
3594                 head = sd->output_queue;
3595                 sd->output_queue = NULL;
3596                 sd->output_queue_tailp = &sd->output_queue;
3597                 local_irq_enable();
3598
3599                 while (head) {
3600                         struct Qdisc *q = head;
3601                         spinlock_t *root_lock;
3602
3603                         head = head->next_sched;
3604
3605                         root_lock = qdisc_lock(q);
3606                         if (spin_trylock(root_lock)) {
3607                                 smp_mb__before_atomic();
3608                                 clear_bit(__QDISC_STATE_SCHED,
3609                                           &q->state);
3610                                 qdisc_run(q);
3611                                 spin_unlock(root_lock);
3612                         } else {
3613                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3614                                               &q->state)) {
3615                                         __netif_reschedule(q);
3616                                 } else {
3617                                         smp_mb__before_atomic();
3618                                         clear_bit(__QDISC_STATE_SCHED,
3619                                                   &q->state);
3620                                 }
3621                         }
3622                 }
3623         }
3624 }
3625
3626 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3627     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3628 /* This hook is defined here for ATM LANE */
3629 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3630                              unsigned char *addr) __read_mostly;
3631 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3632 #endif
3633
3634 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3635                                          struct packet_type **pt_prev,
3636                                          int *ret, struct net_device *orig_dev)
3637 {
3638 #ifdef CONFIG_NET_CLS_ACT
3639         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3640         struct tcf_result cl_res;
3641
3642         /* If there's at least one ingress present somewhere (so
3643          * we get here via enabled static key), remaining devices
3644          * that are not configured with an ingress qdisc will bail
3645          * out here.
3646          */
3647         if (!cl)
3648                 return skb;
3649         if (*pt_prev) {
3650                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3651                 *pt_prev = NULL;
3652         }
3653
3654         qdisc_skb_cb(skb)->pkt_len = skb->len;
3655         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3656         qdisc_bstats_update_cpu(cl->q, skb);
3657
3658         switch (tc_classify(skb, cl, &cl_res)) {
3659         case TC_ACT_OK:
3660         case TC_ACT_RECLASSIFY:
3661                 skb->tc_index = TC_H_MIN(cl_res.classid);
3662                 break;
3663         case TC_ACT_SHOT:
3664                 qdisc_qstats_drop_cpu(cl->q);
3665         case TC_ACT_STOLEN:
3666         case TC_ACT_QUEUED:
3667                 kfree_skb(skb);
3668                 return NULL;
3669         default:
3670                 break;
3671         }
3672 #endif /* CONFIG_NET_CLS_ACT */
3673         return skb;
3674 }
3675
3676 /**
3677  *      netdev_rx_handler_register - register receive handler
3678  *      @dev: device to register a handler for
3679  *      @rx_handler: receive handler to register
3680  *      @rx_handler_data: data pointer that is used by rx handler
3681  *
3682  *      Register a receive handler for a device. This handler will then be
3683  *      called from __netif_receive_skb. A negative errno code is returned
3684  *      on a failure.
3685  *
3686  *      The caller must hold the rtnl_mutex.
3687  *
3688  *      For a general description of rx_handler, see enum rx_handler_result.
3689  */
3690 int netdev_rx_handler_register(struct net_device *dev,
3691                                rx_handler_func_t *rx_handler,
3692                                void *rx_handler_data)
3693 {
3694         ASSERT_RTNL();
3695
3696         if (dev->rx_handler)
3697                 return -EBUSY;
3698
3699         /* Note: rx_handler_data must be set before rx_handler */
3700         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3701         rcu_assign_pointer(dev->rx_handler, rx_handler);
3702
3703         return 0;
3704 }
3705 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3706
3707 /**
3708  *      netdev_rx_handler_unregister - unregister receive handler
3709  *      @dev: device to unregister a handler from
3710  *
3711  *      Unregister a receive handler from a device.
3712  *
3713  *      The caller must hold the rtnl_mutex.
3714  */
3715 void netdev_rx_handler_unregister(struct net_device *dev)
3716 {
3717
3718         ASSERT_RTNL();
3719         RCU_INIT_POINTER(dev->rx_handler, NULL);
3720         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3721          * section has a guarantee to see a non NULL rx_handler_data
3722          * as well.
3723          */
3724         synchronize_net();
3725         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3726 }
3727 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3728
3729 /*
3730  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3731  * the special handling of PFMEMALLOC skbs.
3732  */
3733 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3734 {
3735         switch (skb->protocol) {
3736         case htons(ETH_P_ARP):
3737         case htons(ETH_P_IP):
3738         case htons(ETH_P_IPV6):
3739         case htons(ETH_P_8021Q):
3740         case htons(ETH_P_8021AD):
3741                 return true;
3742         default:
3743                 return false;
3744         }
3745 }
3746
3747 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3748                              int *ret, struct net_device *orig_dev)
3749 {
3750 #ifdef CONFIG_NETFILTER_INGRESS
3751         if (nf_hook_ingress_active(skb)) {
3752                 if (*pt_prev) {
3753                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3754                         *pt_prev = NULL;
3755                 }
3756
3757                 return nf_hook_ingress(skb);
3758         }
3759 #endif /* CONFIG_NETFILTER_INGRESS */
3760         return 0;
3761 }
3762
3763 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3764 {
3765         struct packet_type *ptype, *pt_prev;
3766         rx_handler_func_t *rx_handler;
3767         struct net_device *orig_dev;
3768         bool deliver_exact = false;
3769         int ret = NET_RX_DROP;
3770         __be16 type;
3771
3772         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3773
3774         trace_netif_receive_skb(skb);
3775
3776         orig_dev = skb->dev;
3777
3778         skb_reset_network_header(skb);
3779         if (!skb_transport_header_was_set(skb))
3780                 skb_reset_transport_header(skb);
3781         skb_reset_mac_len(skb);
3782
3783         pt_prev = NULL;
3784
3785         rcu_read_lock();
3786
3787 another_round:
3788         skb->skb_iif = skb->dev->ifindex;
3789
3790         __this_cpu_inc(softnet_data.processed);
3791
3792         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3793             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3794                 skb = skb_vlan_untag(skb);
3795                 if (unlikely(!skb))
3796                         goto unlock;
3797         }
3798
3799 #ifdef CONFIG_NET_CLS_ACT
3800         if (skb->tc_verd & TC_NCLS) {
3801                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3802                 goto ncls;
3803         }
3804 #endif
3805
3806         if (pfmemalloc)
3807                 goto skip_taps;
3808
3809         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3810                 if (pt_prev)
3811                         ret = deliver_skb(skb, pt_prev, orig_dev);
3812                 pt_prev = ptype;
3813         }
3814
3815         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3816                 if (pt_prev)
3817                         ret = deliver_skb(skb, pt_prev, orig_dev);
3818                 pt_prev = ptype;
3819         }
3820
3821 skip_taps:
3822 #ifdef CONFIG_NET_INGRESS
3823         if (static_key_false(&ingress_needed)) {
3824                 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3825                 if (!skb)
3826                         goto unlock;
3827
3828                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3829                         goto unlock;
3830         }
3831 #endif
3832 #ifdef CONFIG_NET_CLS_ACT
3833         skb->tc_verd = 0;
3834 ncls:
3835 #endif
3836         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3837                 goto drop;
3838
3839         if (skb_vlan_tag_present(skb)) {
3840                 if (pt_prev) {
3841                         ret = deliver_skb(skb, pt_prev, orig_dev);
3842                         pt_prev = NULL;
3843                 }
3844                 if (vlan_do_receive(&skb))
3845                         goto another_round;
3846                 else if (unlikely(!skb))
3847                         goto unlock;
3848         }
3849
3850         rx_handler = rcu_dereference(skb->dev->rx_handler);
3851         if (rx_handler) {
3852                 if (pt_prev) {
3853                         ret = deliver_skb(skb, pt_prev, orig_dev);
3854                         pt_prev = NULL;
3855                 }
3856                 switch (rx_handler(&skb)) {
3857                 case RX_HANDLER_CONSUMED:
3858                         ret = NET_RX_SUCCESS;
3859                         goto unlock;
3860                 case RX_HANDLER_ANOTHER:
3861                         goto another_round;
3862                 case RX_HANDLER_EXACT:
3863                         deliver_exact = true;
3864                 case RX_HANDLER_PASS:
3865                         break;
3866                 default:
3867                         BUG();
3868                 }
3869         }
3870
3871         if (unlikely(skb_vlan_tag_present(skb))) {
3872                 if (skb_vlan_tag_get_id(skb))
3873                         skb->pkt_type = PACKET_OTHERHOST;
3874                 /* Note: we might in the future use prio bits
3875                  * and set skb->priority like in vlan_do_receive()
3876                  * For the time being, just ignore Priority Code Point
3877                  */
3878                 skb->vlan_tci = 0;
3879         }
3880
3881         type = skb->protocol;
3882
3883         /* deliver only exact match when indicated */
3884         if (likely(!deliver_exact)) {
3885                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3886                                        &ptype_base[ntohs(type) &
3887                                                    PTYPE_HASH_MASK]);
3888         }
3889
3890         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3891                                &orig_dev->ptype_specific);
3892
3893         if (unlikely(skb->dev != orig_dev)) {
3894                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3895                                        &skb->dev->ptype_specific);
3896         }
3897
3898         if (pt_prev) {
3899                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3900                         goto drop;
3901                 else
3902                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3903         } else {
3904 drop:
3905                 atomic_long_inc(&skb->dev->rx_dropped);
3906                 kfree_skb(skb);
3907                 /* Jamal, now you will not able to escape explaining
3908                  * me how you were going to use this. :-)
3909                  */
3910                 ret = NET_RX_DROP;
3911         }
3912
3913 unlock:
3914         rcu_read_unlock();
3915         return ret;
3916 }
3917
3918 static int __netif_receive_skb(struct sk_buff *skb)
3919 {
3920         int ret;
3921
3922         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3923                 unsigned long pflags = current->flags;
3924
3925                 /*
3926                  * PFMEMALLOC skbs are special, they should
3927                  * - be delivered to SOCK_MEMALLOC sockets only
3928                  * - stay away from userspace
3929                  * - have bounded memory usage
3930                  *
3931                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3932                  * context down to all allocation sites.
3933                  */
3934                 current->flags |= PF_MEMALLOC;
3935                 ret = __netif_receive_skb_core(skb, true);
3936                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3937         } else
3938                 ret = __netif_receive_skb_core(skb, false);
3939
3940         return ret;
3941 }
3942
3943 static int netif_receive_skb_internal(struct sk_buff *skb)
3944 {
3945         net_timestamp_check(netdev_tstamp_prequeue, skb);
3946
3947         if (skb_defer_rx_timestamp(skb))
3948                 return NET_RX_SUCCESS;
3949
3950 #ifdef CONFIG_RPS
3951         if (static_key_false(&rps_needed)) {
3952                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3953                 int cpu, ret;
3954
3955                 rcu_read_lock();
3956
3957                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3958
3959                 if (cpu >= 0) {
3960                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3961                         rcu_read_unlock();
3962                         return ret;
3963                 }
3964                 rcu_read_unlock();
3965         }
3966 #endif
3967         return __netif_receive_skb(skb);
3968 }
3969
3970 /**
3971  *      netif_receive_skb - process receive buffer from network
3972  *      @skb: buffer to process
3973  *
3974  *      netif_receive_skb() is the main receive data processing function.
3975  *      It always succeeds. The buffer may be dropped during processing
3976  *      for congestion control or by the protocol layers.
3977  *
3978  *      This function may only be called from softirq context and interrupts
3979  *      should be enabled.
3980  *
3981  *      Return values (usually ignored):
3982  *      NET_RX_SUCCESS: no congestion
3983  *      NET_RX_DROP: packet was dropped
3984  */
3985 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
3986 {
3987         trace_netif_receive_skb_entry(skb);
3988
3989         return netif_receive_skb_internal(skb);
3990 }
3991 EXPORT_SYMBOL(netif_receive_skb_sk);
3992
3993 /* Network device is going away, flush any packets still pending
3994  * Called with irqs disabled.
3995  */
3996 static void flush_backlog(void *arg)
3997 {
3998         struct net_device *dev = arg;
3999         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4000         struct sk_buff *skb, *tmp;
4001
4002         rps_lock(sd);
4003         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4004                 if (skb->dev == dev) {
4005                         __skb_unlink(skb, &sd->input_pkt_queue);
4006                         kfree_skb(skb);
4007                         input_queue_head_incr(sd);
4008                 }
4009         }
4010         rps_unlock(sd);
4011
4012         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4013                 if (skb->dev == dev) {
4014                         __skb_unlink(skb, &sd->process_queue);
4015                         kfree_skb(skb);
4016                         input_queue_head_incr(sd);
4017                 }
4018         }
4019 }
4020
4021 static int napi_gro_complete(struct sk_buff *skb)
4022 {
4023         struct packet_offload *ptype;
4024         __be16 type = skb->protocol;
4025         struct list_head *head = &offload_base;
4026         int err = -ENOENT;
4027
4028         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4029
4030         if (NAPI_GRO_CB(skb)->count == 1) {
4031                 skb_shinfo(skb)->gso_size = 0;
4032                 goto out;
4033         }
4034
4035         rcu_read_lock();
4036         list_for_each_entry_rcu(ptype, head, list) {
4037                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4038                         continue;
4039
4040                 err = ptype->callbacks.gro_complete(skb, 0);
4041                 break;
4042         }
4043         rcu_read_unlock();
4044
4045         if (err) {
4046                 WARN_ON(&ptype->list == head);
4047                 kfree_skb(skb);
4048                 return NET_RX_SUCCESS;
4049         }
4050
4051 out:
4052         return netif_receive_skb_internal(skb);
4053 }
4054
4055 /* napi->gro_list contains packets ordered by age.
4056  * youngest packets at the head of it.
4057  * Complete skbs in reverse order to reduce latencies.
4058  */
4059 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4060 {
4061         struct sk_buff *skb, *prev = NULL;
4062
4063         /* scan list and build reverse chain */
4064         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4065                 skb->prev = prev;
4066                 prev = skb;
4067         }
4068
4069         for (skb = prev; skb; skb = prev) {
4070                 skb->next = NULL;
4071
4072                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4073                         return;
4074
4075                 prev = skb->prev;
4076                 napi_gro_complete(skb);
4077                 napi->gro_count--;
4078         }
4079
4080         napi->gro_list = NULL;
4081 }
4082 EXPORT_SYMBOL(napi_gro_flush);
4083
4084 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4085 {
4086         struct sk_buff *p;
4087         unsigned int maclen = skb->dev->hard_header_len;
4088         u32 hash = skb_get_hash_raw(skb);
4089
4090         for (p = napi->gro_list; p; p = p->next) {
4091                 unsigned long diffs;
4092
4093                 NAPI_GRO_CB(p)->flush = 0;
4094
4095                 if (hash != skb_get_hash_raw(p)) {
4096                         NAPI_GRO_CB(p)->same_flow = 0;
4097                         continue;
4098                 }
4099
4100                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4101                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4102                 if (maclen == ETH_HLEN)
4103                         diffs |= compare_ether_header(skb_mac_header(p),
4104                                                       skb_mac_header(skb));
4105                 else if (!diffs)
4106                         diffs = memcmp(skb_mac_header(p),
4107                                        skb_mac_header(skb),
4108                                        maclen);
4109                 NAPI_GRO_CB(p)->same_flow = !diffs;
4110         }
4111 }
4112
4113 static void skb_gro_reset_offset(struct sk_buff *skb)
4114 {
4115         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4116         const skb_frag_t *frag0 = &pinfo->frags[0];
4117
4118         NAPI_GRO_CB(skb)->data_offset = 0;
4119         NAPI_GRO_CB(skb)->frag0 = NULL;
4120         NAPI_GRO_CB(skb)->frag0_len = 0;
4121
4122         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4123             pinfo->nr_frags &&
4124             !PageHighMem(skb_frag_page(frag0))) {
4125                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4126                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4127         }
4128 }
4129
4130 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4131 {
4132         struct skb_shared_info *pinfo = skb_shinfo(skb);
4133
4134         BUG_ON(skb->end - skb->tail < grow);
4135
4136         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4137
4138         skb->data_len -= grow;
4139         skb->tail += grow;
4140
4141         pinfo->frags[0].page_offset += grow;
4142         skb_frag_size_sub(&pinfo->frags[0], grow);
4143
4144         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4145                 skb_frag_unref(skb, 0);
4146                 memmove(pinfo->frags, pinfo->frags + 1,
4147                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4148         }
4149 }
4150
4151 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4152 {
4153         struct sk_buff **pp = NULL;
4154         struct packet_offload *ptype;
4155         __be16 type = skb->protocol;
4156         struct list_head *head = &offload_base;
4157         int same_flow;
4158         enum gro_result ret;
4159         int grow;
4160
4161         if (!(skb->dev->features & NETIF_F_GRO))
4162                 goto normal;
4163
4164         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4165                 goto normal;
4166
4167         gro_list_prepare(napi, skb);
4168
4169         rcu_read_lock();
4170         list_for_each_entry_rcu(ptype, head, list) {
4171                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4172                         continue;
4173
4174                 skb_set_network_header(skb, skb_gro_offset(skb));
4175                 skb_reset_mac_len(skb);
4176                 NAPI_GRO_CB(skb)->same_flow = 0;
4177                 NAPI_GRO_CB(skb)->flush = 0;
4178                 NAPI_GRO_CB(skb)->free = 0;
4179                 NAPI_GRO_CB(skb)->udp_mark = 0;
4180                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4181
4182                 /* Setup for GRO checksum validation */
4183                 switch (skb->ip_summed) {
4184                 case CHECKSUM_COMPLETE:
4185                         NAPI_GRO_CB(skb)->csum = skb->csum;
4186                         NAPI_GRO_CB(skb)->csum_valid = 1;
4187                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4188                         break;
4189                 case CHECKSUM_UNNECESSARY:
4190                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4191                         NAPI_GRO_CB(skb)->csum_valid = 0;
4192                         break;
4193                 default:
4194                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4195                         NAPI_GRO_CB(skb)->csum_valid = 0;
4196                 }
4197
4198                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4199                 break;
4200         }
4201         rcu_read_unlock();
4202
4203         if (&ptype->list == head)
4204                 goto normal;
4205
4206         same_flow = NAPI_GRO_CB(skb)->same_flow;
4207         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4208
4209         if (pp) {
4210                 struct sk_buff *nskb = *pp;
4211
4212                 *pp = nskb->next;
4213                 nskb->next = NULL;
4214                 napi_gro_complete(nskb);
4215                 napi->gro_count--;
4216         }
4217
4218         if (same_flow)
4219                 goto ok;
4220
4221         if (NAPI_GRO_CB(skb)->flush)
4222                 goto normal;
4223
4224         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4225                 struct sk_buff *nskb = napi->gro_list;
4226
4227                 /* locate the end of the list to select the 'oldest' flow */
4228                 while (nskb->next) {
4229                         pp = &nskb->next;
4230                         nskb = *pp;
4231                 }
4232                 *pp = NULL;
4233                 nskb->next = NULL;
4234                 napi_gro_complete(nskb);
4235         } else {
4236                 napi->gro_count++;
4237         }
4238         NAPI_GRO_CB(skb)->count = 1;
4239         NAPI_GRO_CB(skb)->age = jiffies;
4240         NAPI_GRO_CB(skb)->last = skb;
4241         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4242         skb->next = napi->gro_list;
4243         napi->gro_list = skb;
4244         ret = GRO_HELD;
4245
4246 pull:
4247         grow = skb_gro_offset(skb) - skb_headlen(skb);
4248         if (grow > 0)
4249                 gro_pull_from_frag0(skb, grow);
4250 ok:
4251         return ret;
4252
4253 normal:
4254         ret = GRO_NORMAL;
4255         goto pull;
4256 }
4257
4258 struct packet_offload *gro_find_receive_by_type(__be16 type)
4259 {
4260         struct list_head *offload_head = &offload_base;
4261         struct packet_offload *ptype;
4262
4263         list_for_each_entry_rcu(ptype, offload_head, list) {
4264                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4265                         continue;
4266                 return ptype;
4267         }
4268         return NULL;
4269 }
4270 EXPORT_SYMBOL(gro_find_receive_by_type);
4271
4272 struct packet_offload *gro_find_complete_by_type(__be16 type)
4273 {
4274         struct list_head *offload_head = &offload_base;
4275         struct packet_offload *ptype;
4276
4277         list_for_each_entry_rcu(ptype, offload_head, list) {
4278                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4279                         continue;
4280                 return ptype;
4281         }
4282         return NULL;
4283 }
4284 EXPORT_SYMBOL(gro_find_complete_by_type);
4285
4286 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4287 {
4288         switch (ret) {
4289         case GRO_NORMAL:
4290                 if (netif_receive_skb_internal(skb))
4291                         ret = GRO_DROP;
4292                 break;
4293
4294         case GRO_DROP:
4295                 kfree_skb(skb);
4296                 break;
4297
4298         case GRO_MERGED_FREE:
4299                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4300                         kmem_cache_free(skbuff_head_cache, skb);
4301                 else
4302                         __kfree_skb(skb);
4303                 break;
4304
4305         case GRO_HELD:
4306         case GRO_MERGED:
4307                 break;
4308         }
4309
4310         return ret;
4311 }
4312
4313 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4314 {
4315         trace_napi_gro_receive_entry(skb);
4316
4317         skb_gro_reset_offset(skb);
4318
4319         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4320 }
4321 EXPORT_SYMBOL(napi_gro_receive);
4322
4323 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4324 {
4325         if (unlikely(skb->pfmemalloc)) {
4326                 consume_skb(skb);
4327                 return;
4328         }
4329         __skb_pull(skb, skb_headlen(skb));
4330         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4331         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4332         skb->vlan_tci = 0;
4333         skb->dev = napi->dev;
4334         skb->skb_iif = 0;
4335         skb->encapsulation = 0;
4336         skb_shinfo(skb)->gso_type = 0;
4337         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4338
4339         napi->skb = skb;
4340 }
4341
4342 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4343 {
4344         struct sk_buff *skb = napi->skb;
4345
4346         if (!skb) {
4347                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4348                 napi->skb = skb;
4349         }
4350         return skb;
4351 }
4352 EXPORT_SYMBOL(napi_get_frags);
4353
4354 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4355                                       struct sk_buff *skb,
4356                                       gro_result_t ret)
4357 {
4358         switch (ret) {
4359         case GRO_NORMAL:
4360         case GRO_HELD:
4361                 __skb_push(skb, ETH_HLEN);
4362                 skb->protocol = eth_type_trans(skb, skb->dev);
4363                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4364                         ret = GRO_DROP;
4365                 break;
4366
4367         case GRO_DROP:
4368         case GRO_MERGED_FREE:
4369                 napi_reuse_skb(napi, skb);
4370                 break;
4371
4372         case GRO_MERGED:
4373                 break;
4374         }
4375
4376         return ret;
4377 }
4378
4379 /* Upper GRO stack assumes network header starts at gro_offset=0
4380  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4381  * We copy ethernet header into skb->data to have a common layout.
4382  */
4383 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4384 {
4385         struct sk_buff *skb = napi->skb;
4386         const struct ethhdr *eth;
4387         unsigned int hlen = sizeof(*eth);
4388
4389         napi->skb = NULL;
4390
4391         skb_reset_mac_header(skb);
4392         skb_gro_reset_offset(skb);
4393
4394         eth = skb_gro_header_fast(skb, 0);
4395         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4396                 eth = skb_gro_header_slow(skb, hlen, 0);
4397                 if (unlikely(!eth)) {
4398                         napi_reuse_skb(napi, skb);
4399                         return NULL;
4400                 }
4401         } else {
4402                 gro_pull_from_frag0(skb, hlen);
4403                 NAPI_GRO_CB(skb)->frag0 += hlen;
4404                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4405         }
4406         __skb_pull(skb, hlen);
4407
4408         /*
4409          * This works because the only protocols we care about don't require
4410          * special handling.
4411          * We'll fix it up properly in napi_frags_finish()
4412          */
4413         skb->protocol = eth->h_proto;
4414
4415         return skb;
4416 }
4417
4418 gro_result_t napi_gro_frags(struct napi_struct *napi)
4419 {
4420         struct sk_buff *skb = napi_frags_skb(napi);
4421
4422         if (!skb)
4423                 return GRO_DROP;
4424
4425         trace_napi_gro_frags_entry(skb);
4426
4427         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4428 }
4429 EXPORT_SYMBOL(napi_gro_frags);
4430
4431 /* Compute the checksum from gro_offset and return the folded value
4432  * after adding in any pseudo checksum.
4433  */
4434 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4435 {
4436         __wsum wsum;
4437         __sum16 sum;
4438
4439         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4440
4441         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4442         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4443         if (likely(!sum)) {
4444                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4445                     !skb->csum_complete_sw)
4446                         netdev_rx_csum_fault(skb->dev);
4447         }
4448
4449         NAPI_GRO_CB(skb)->csum = wsum;
4450         NAPI_GRO_CB(skb)->csum_valid = 1;
4451
4452         return sum;
4453 }
4454 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4455
4456 /*
4457  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4458  * Note: called with local irq disabled, but exits with local irq enabled.
4459  */
4460 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4461 {
4462 #ifdef CONFIG_RPS
4463         struct softnet_data *remsd = sd->rps_ipi_list;
4464
4465         if (remsd) {
4466                 sd->rps_ipi_list = NULL;
4467
4468                 local_irq_enable();
4469
4470                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4471                 while (remsd) {
4472                         struct softnet_data *next = remsd->rps_ipi_next;
4473
4474                         if (cpu_online(remsd->cpu))
4475                                 smp_call_function_single_async(remsd->cpu,
4476                                                            &remsd->csd);
4477                         remsd = next;
4478                 }
4479         } else
4480 #endif
4481                 local_irq_enable();
4482 }
4483
4484 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4485 {
4486 #ifdef CONFIG_RPS
4487         return sd->rps_ipi_list != NULL;
4488 #else
4489         return false;
4490 #endif
4491 }
4492
4493 static int process_backlog(struct napi_struct *napi, int quota)
4494 {
4495         int work = 0;
4496         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4497
4498         /* Check if we have pending ipi, its better to send them now,
4499          * not waiting net_rx_action() end.
4500          */
4501         if (sd_has_rps_ipi_waiting(sd)) {
4502                 local_irq_disable();
4503                 net_rps_action_and_irq_enable(sd);
4504         }
4505
4506         napi->weight = weight_p;
4507         local_irq_disable();
4508         while (1) {
4509                 struct sk_buff *skb;
4510
4511                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4512                         local_irq_enable();
4513                         __netif_receive_skb(skb);
4514                         local_irq_disable();
4515                         input_queue_head_incr(sd);
4516                         if (++work >= quota) {
4517                                 local_irq_enable();
4518                                 return work;
4519                         }
4520                 }
4521
4522                 rps_lock(sd);
4523                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4524                         /*
4525                          * Inline a custom version of __napi_complete().
4526                          * only current cpu owns and manipulates this napi,
4527                          * and NAPI_STATE_SCHED is the only possible flag set
4528                          * on backlog.
4529                          * We can use a plain write instead of clear_bit(),
4530                          * and we dont need an smp_mb() memory barrier.
4531                          */
4532                         napi->state = 0;
4533                         rps_unlock(sd);
4534
4535                         break;
4536                 }
4537
4538                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4539                                            &sd->process_queue);
4540                 rps_unlock(sd);
4541         }
4542         local_irq_enable();
4543
4544         return work;
4545 }
4546
4547 /**
4548  * __napi_schedule - schedule for receive
4549  * @n: entry to schedule
4550  *
4551  * The entry's receive function will be scheduled to run.
4552  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4553  */
4554 void __napi_schedule(struct napi_struct *n)
4555 {
4556         unsigned long flags;
4557
4558         local_irq_save(flags);
4559         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4560         local_irq_restore(flags);
4561 }
4562 EXPORT_SYMBOL(__napi_schedule);
4563
4564 /**
4565  * __napi_schedule_irqoff - schedule for receive
4566  * @n: entry to schedule
4567  *
4568  * Variant of __napi_schedule() assuming hard irqs are masked
4569  */
4570 void __napi_schedule_irqoff(struct napi_struct *n)
4571 {
4572         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4573 }
4574 EXPORT_SYMBOL(__napi_schedule_irqoff);
4575
4576 void __napi_complete(struct napi_struct *n)
4577 {
4578         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4579
4580         list_del_init(&n->poll_list);
4581         smp_mb__before_atomic();
4582         clear_bit(NAPI_STATE_SCHED, &n->state);
4583 }
4584 EXPORT_SYMBOL(__napi_complete);
4585
4586 void napi_complete_done(struct napi_struct *n, int work_done)
4587 {
4588         unsigned long flags;
4589
4590         /*
4591          * don't let napi dequeue from the cpu poll list
4592          * just in case its running on a different cpu
4593          */
4594         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4595                 return;
4596
4597         if (n->gro_list) {
4598                 unsigned long timeout = 0;
4599
4600                 if (work_done)
4601                         timeout = n->dev->gro_flush_timeout;
4602
4603                 if (timeout)
4604                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4605                                       HRTIMER_MODE_REL_PINNED);
4606                 else
4607                         napi_gro_flush(n, false);
4608         }
4609         if (likely(list_empty(&n->poll_list))) {
4610                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4611         } else {
4612                 /* If n->poll_list is not empty, we need to mask irqs */
4613                 local_irq_save(flags);
4614                 __napi_complete(n);
4615                 local_irq_restore(flags);
4616         }
4617 }
4618 EXPORT_SYMBOL(napi_complete_done);
4619
4620 /* must be called under rcu_read_lock(), as we dont take a reference */
4621 struct napi_struct *napi_by_id(unsigned int napi_id)
4622 {
4623         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4624         struct napi_struct *napi;
4625
4626         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4627                 if (napi->napi_id == napi_id)
4628                         return napi;
4629
4630         return NULL;
4631 }
4632 EXPORT_SYMBOL_GPL(napi_by_id);
4633
4634 void napi_hash_add(struct napi_struct *napi)
4635 {
4636         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4637
4638                 spin_lock(&napi_hash_lock);
4639
4640                 /* 0 is not a valid id, we also skip an id that is taken
4641                  * we expect both events to be extremely rare
4642                  */
4643                 napi->napi_id = 0;
4644                 while (!napi->napi_id) {
4645                         napi->napi_id = ++napi_gen_id;
4646                         if (napi_by_id(napi->napi_id))
4647                                 napi->napi_id = 0;
4648                 }
4649
4650                 hlist_add_head_rcu(&napi->napi_hash_node,
4651                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4652
4653                 spin_unlock(&napi_hash_lock);
4654         }
4655 }
4656 EXPORT_SYMBOL_GPL(napi_hash_add);
4657
4658 /* Warning : caller is responsible to make sure rcu grace period
4659  * is respected before freeing memory containing @napi
4660  */
4661 void napi_hash_del(struct napi_struct *napi)
4662 {
4663         spin_lock(&napi_hash_lock);
4664
4665         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4666                 hlist_del_rcu(&napi->napi_hash_node);
4667
4668         spin_unlock(&napi_hash_lock);
4669 }
4670 EXPORT_SYMBOL_GPL(napi_hash_del);
4671
4672 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4673 {
4674         struct napi_struct *napi;
4675
4676         napi = container_of(timer, struct napi_struct, timer);
4677         if (napi->gro_list)
4678                 napi_schedule(napi);
4679
4680         return HRTIMER_NORESTART;
4681 }
4682
4683 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4684                     int (*poll)(struct napi_struct *, int), int weight)
4685 {
4686         INIT_LIST_HEAD(&napi->poll_list);
4687         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4688         napi->timer.function = napi_watchdog;
4689         napi->gro_count = 0;
4690         napi->gro_list = NULL;
4691         napi->skb = NULL;
4692         napi->poll = poll;
4693         if (weight > NAPI_POLL_WEIGHT)
4694                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4695                             weight, dev->name);
4696         napi->weight = weight;
4697         list_add(&napi->dev_list, &dev->napi_list);
4698         napi->dev = dev;
4699 #ifdef CONFIG_NETPOLL
4700         spin_lock_init(&napi->poll_lock);
4701         napi->poll_owner = -1;
4702 #endif
4703         set_bit(NAPI_STATE_SCHED, &napi->state);
4704 }
4705 EXPORT_SYMBOL(netif_napi_add);
4706
4707 void napi_disable(struct napi_struct *n)
4708 {
4709         might_sleep();
4710         set_bit(NAPI_STATE_DISABLE, &n->state);
4711
4712         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4713                 msleep(1);
4714
4715         hrtimer_cancel(&n->timer);
4716
4717         clear_bit(NAPI_STATE_DISABLE, &n->state);
4718 }
4719 EXPORT_SYMBOL(napi_disable);
4720
4721 void netif_napi_del(struct napi_struct *napi)
4722 {
4723         list_del_init(&napi->dev_list);
4724         napi_free_frags(napi);
4725
4726         kfree_skb_list(napi->gro_list);
4727         napi->gro_list = NULL;
4728         napi->gro_count = 0;
4729 }
4730 EXPORT_SYMBOL(netif_napi_del);
4731
4732 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4733 {
4734         void *have;
4735         int work, weight;
4736
4737         list_del_init(&n->poll_list);
4738
4739         have = netpoll_poll_lock(n);
4740
4741         weight = n->weight;
4742
4743         /* This NAPI_STATE_SCHED test is for avoiding a race
4744          * with netpoll's poll_napi().  Only the entity which
4745          * obtains the lock and sees NAPI_STATE_SCHED set will
4746          * actually make the ->poll() call.  Therefore we avoid
4747          * accidentally calling ->poll() when NAPI is not scheduled.
4748          */
4749         work = 0;
4750         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4751                 work = n->poll(n, weight);
4752                 trace_napi_poll(n);
4753         }
4754
4755         WARN_ON_ONCE(work > weight);
4756
4757         if (likely(work < weight))
4758                 goto out_unlock;
4759
4760         /* Drivers must not modify the NAPI state if they
4761          * consume the entire weight.  In such cases this code
4762          * still "owns" the NAPI instance and therefore can
4763          * move the instance around on the list at-will.
4764          */
4765         if (unlikely(napi_disable_pending(n))) {
4766                 napi_complete(n);
4767                 goto out_unlock;
4768         }
4769
4770         if (n->gro_list) {
4771                 /* flush too old packets
4772                  * If HZ < 1000, flush all packets.
4773                  */
4774                 napi_gro_flush(n, HZ >= 1000);
4775         }
4776
4777         /* Some drivers may have called napi_schedule
4778          * prior to exhausting their budget.
4779          */
4780         if (unlikely(!list_empty(&n->poll_list))) {
4781                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4782                              n->dev ? n->dev->name : "backlog");
4783                 goto out_unlock;
4784         }
4785
4786         list_add_tail(&n->poll_list, repoll);
4787
4788 out_unlock:
4789         netpoll_poll_unlock(have);
4790
4791         return work;
4792 }
4793
4794 static void net_rx_action(struct softirq_action *h)
4795 {
4796         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4797         unsigned long time_limit = jiffies + 2;
4798         int budget = netdev_budget;
4799         LIST_HEAD(list);
4800         LIST_HEAD(repoll);
4801
4802         local_irq_disable();
4803         list_splice_init(&sd->poll_list, &list);
4804         local_irq_enable();
4805
4806         for (;;) {
4807                 struct napi_struct *n;
4808
4809                 if (list_empty(&list)) {
4810                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4811                                 return;
4812                         break;
4813                 }
4814
4815                 n = list_first_entry(&list, struct napi_struct, poll_list);
4816                 budget -= napi_poll(n, &repoll);
4817
4818                 /* If softirq window is exhausted then punt.
4819                  * Allow this to run for 2 jiffies since which will allow
4820                  * an average latency of 1.5/HZ.
4821                  */
4822                 if (unlikely(budget <= 0 ||
4823                              time_after_eq(jiffies, time_limit))) {
4824                         sd->time_squeeze++;
4825                         break;
4826                 }
4827         }
4828
4829         local_irq_disable();
4830
4831         list_splice_tail_init(&sd->poll_list, &list);
4832         list_splice_tail(&repoll, &list);
4833         list_splice(&list, &sd->poll_list);
4834         if (!list_empty(&sd->poll_list))
4835                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4836
4837         net_rps_action_and_irq_enable(sd);
4838 }
4839
4840 struct netdev_adjacent {
4841         struct net_device *dev;
4842
4843         /* upper master flag, there can only be one master device per list */
4844         bool master;
4845
4846         /* counter for the number of times this device was added to us */
4847         u16 ref_nr;
4848
4849         /* private field for the users */
4850         void *private;
4851
4852         struct list_head list;
4853         struct rcu_head rcu;
4854 };
4855
4856 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4857                                                  struct net_device *adj_dev,
4858                                                  struct list_head *adj_list)
4859 {
4860         struct netdev_adjacent *adj;
4861
4862         list_for_each_entry(adj, adj_list, list) {
4863                 if (adj->dev == adj_dev)
4864                         return adj;
4865         }
4866         return NULL;
4867 }
4868
4869 /**
4870  * netdev_has_upper_dev - Check if device is linked to an upper device
4871  * @dev: device
4872  * @upper_dev: upper device to check
4873  *
4874  * Find out if a device is linked to specified upper device and return true
4875  * in case it is. Note that this checks only immediate upper device,
4876  * not through a complete stack of devices. The caller must hold the RTNL lock.
4877  */
4878 bool netdev_has_upper_dev(struct net_device *dev,
4879                           struct net_device *upper_dev)
4880 {
4881         ASSERT_RTNL();
4882
4883         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4884 }
4885 EXPORT_SYMBOL(netdev_has_upper_dev);
4886
4887 /**
4888  * netdev_has_any_upper_dev - Check if device is linked to some device
4889  * @dev: device
4890  *
4891  * Find out if a device is linked to an upper device and return true in case
4892  * it is. The caller must hold the RTNL lock.
4893  */
4894 static bool netdev_has_any_upper_dev(struct net_device *dev)
4895 {
4896         ASSERT_RTNL();
4897
4898         return !list_empty(&dev->all_adj_list.upper);
4899 }
4900
4901 /**
4902  * netdev_master_upper_dev_get - Get master upper device
4903  * @dev: device
4904  *
4905  * Find a master upper device and return pointer to it or NULL in case
4906  * it's not there. The caller must hold the RTNL lock.
4907  */
4908 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4909 {
4910         struct netdev_adjacent *upper;
4911
4912         ASSERT_RTNL();
4913
4914         if (list_empty(&dev->adj_list.upper))
4915                 return NULL;
4916
4917         upper = list_first_entry(&dev->adj_list.upper,
4918                                  struct netdev_adjacent, list);
4919         if (likely(upper->master))
4920                 return upper->dev;
4921         return NULL;
4922 }
4923 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4924
4925 void *netdev_adjacent_get_private(struct list_head *adj_list)
4926 {
4927         struct netdev_adjacent *adj;
4928
4929         adj = list_entry(adj_list, struct netdev_adjacent, list);
4930
4931         return adj->private;
4932 }
4933 EXPORT_SYMBOL(netdev_adjacent_get_private);
4934
4935 /**
4936  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4937  * @dev: device
4938  * @iter: list_head ** of the current position
4939  *
4940  * Gets the next device from the dev's upper list, starting from iter
4941  * position. The caller must hold RCU read lock.
4942  */
4943 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4944                                                  struct list_head **iter)
4945 {
4946         struct netdev_adjacent *upper;
4947
4948         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4949
4950         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4951
4952         if (&upper->list == &dev->adj_list.upper)
4953                 return NULL;
4954
4955         *iter = &upper->list;
4956
4957         return upper->dev;
4958 }
4959 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4960
4961 /**
4962  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4963  * @dev: device
4964  * @iter: list_head ** of the current position
4965  *
4966  * Gets the next device from the dev's upper list, starting from iter
4967  * position. The caller must hold RCU read lock.
4968  */
4969 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4970                                                      struct list_head **iter)
4971 {
4972         struct netdev_adjacent *upper;
4973
4974         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4975
4976         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4977
4978         if (&upper->list == &dev->all_adj_list.upper)
4979                 return NULL;
4980
4981         *iter = &upper->list;
4982
4983         return upper->dev;
4984 }
4985 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4986
4987 /**
4988  * netdev_lower_get_next_private - Get the next ->private from the
4989  *                                 lower neighbour list
4990  * @dev: device
4991  * @iter: list_head ** of the current position
4992  *
4993  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4994  * list, starting from iter position. The caller must hold either hold the
4995  * RTNL lock or its own locking that guarantees that the neighbour lower
4996  * list will remain unchainged.
4997  */
4998 void *netdev_lower_get_next_private(struct net_device *dev,
4999                                     struct list_head **iter)
5000 {
5001         struct netdev_adjacent *lower;
5002
5003         lower = list_entry(*iter, struct netdev_adjacent, list);
5004
5005         if (&lower->list == &dev->adj_list.lower)
5006                 return NULL;
5007
5008         *iter = lower->list.next;
5009
5010         return lower->private;
5011 }
5012 EXPORT_SYMBOL(netdev_lower_get_next_private);
5013
5014 /**
5015  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5016  *                                     lower neighbour list, RCU
5017  *                                     variant
5018  * @dev: device
5019  * @iter: list_head ** of the current position
5020  *
5021  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5022  * list, starting from iter position. The caller must hold RCU read lock.
5023  */
5024 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5025                                         struct list_head **iter)
5026 {
5027         struct netdev_adjacent *lower;
5028
5029         WARN_ON_ONCE(!rcu_read_lock_held());
5030
5031         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5032
5033         if (&lower->list == &dev->adj_list.lower)
5034                 return NULL;
5035
5036         *iter = &lower->list;
5037
5038         return lower->private;
5039 }
5040 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5041
5042 /**
5043  * netdev_lower_get_next - Get the next device from the lower neighbour
5044  *                         list
5045  * @dev: device
5046  * @iter: list_head ** of the current position
5047  *
5048  * Gets the next netdev_adjacent from the dev's lower neighbour
5049  * list, starting from iter position. The caller must hold RTNL lock or
5050  * its own locking that guarantees that the neighbour lower
5051  * list will remain unchainged.
5052  */
5053 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5054 {
5055         struct netdev_adjacent *lower;
5056
5057         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5058
5059         if (&lower->list == &dev->adj_list.lower)
5060                 return NULL;
5061
5062         *iter = &lower->list;
5063
5064         return lower->dev;
5065 }
5066 EXPORT_SYMBOL(netdev_lower_get_next);
5067
5068 /**
5069  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5070  *                                     lower neighbour list, RCU
5071  *                                     variant
5072  * @dev: device
5073  *
5074  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5075  * list. The caller must hold RCU read lock.
5076  */
5077 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5078 {
5079         struct netdev_adjacent *lower;
5080
5081         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5082                         struct netdev_adjacent, list);
5083         if (lower)
5084                 return lower->private;
5085         return NULL;
5086 }
5087 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5088
5089 /**
5090  * netdev_master_upper_dev_get_rcu - Get master upper device
5091  * @dev: device
5092  *
5093  * Find a master upper device and return pointer to it or NULL in case
5094  * it's not there. The caller must hold the RCU read lock.
5095  */
5096 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5097 {
5098         struct netdev_adjacent *upper;
5099
5100         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5101                                        struct netdev_adjacent, list);
5102         if (upper && likely(upper->master))
5103                 return upper->dev;
5104         return NULL;
5105 }
5106 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5107
5108 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5109                               struct net_device *adj_dev,
5110                               struct list_head *dev_list)
5111 {
5112         char linkname[IFNAMSIZ+7];
5113         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5114                 "upper_%s" : "lower_%s", adj_dev->name);
5115         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5116                                  linkname);
5117 }
5118 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5119                                char *name,
5120                                struct list_head *dev_list)
5121 {
5122         char linkname[IFNAMSIZ+7];
5123         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5124                 "upper_%s" : "lower_%s", name);
5125         sysfs_remove_link(&(dev->dev.kobj), linkname);
5126 }
5127
5128 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5129                                                  struct net_device *adj_dev,
5130                                                  struct list_head *dev_list)
5131 {
5132         return (dev_list == &dev->adj_list.upper ||
5133                 dev_list == &dev->adj_list.lower) &&
5134                 net_eq(dev_net(dev), dev_net(adj_dev));
5135 }
5136
5137 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5138                                         struct net_device *adj_dev,
5139                                         struct list_head *dev_list,
5140                                         void *private, bool master)
5141 {
5142         struct netdev_adjacent *adj;
5143         int ret;
5144
5145         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5146
5147         if (adj) {
5148                 adj->ref_nr++;
5149                 return 0;
5150         }
5151
5152         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5153         if (!adj)
5154                 return -ENOMEM;
5155
5156         adj->dev = adj_dev;
5157         adj->master = master;
5158         adj->ref_nr = 1;
5159         adj->private = private;
5160         dev_hold(adj_dev);
5161
5162         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5163                  adj_dev->name, dev->name, adj_dev->name);
5164
5165         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5166                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5167                 if (ret)
5168                         goto free_adj;
5169         }
5170
5171         /* Ensure that master link is always the first item in list. */
5172         if (master) {
5173                 ret = sysfs_create_link(&(dev->dev.kobj),
5174                                         &(adj_dev->dev.kobj), "master");
5175                 if (ret)
5176                         goto remove_symlinks;
5177
5178                 list_add_rcu(&adj->list, dev_list);
5179         } else {
5180                 list_add_tail_rcu(&adj->list, dev_list);
5181         }
5182
5183         return 0;
5184
5185 remove_symlinks:
5186         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5187                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5188 free_adj:
5189         kfree(adj);
5190         dev_put(adj_dev);
5191
5192         return ret;
5193 }
5194
5195 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5196                                          struct net_device *adj_dev,
5197                                          struct list_head *dev_list)
5198 {
5199         struct netdev_adjacent *adj;
5200
5201         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5202
5203         if (!adj) {
5204                 pr_err("tried to remove device %s from %s\n",
5205                        dev->name, adj_dev->name);
5206                 BUG();
5207         }
5208
5209         if (adj->ref_nr > 1) {
5210                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5211                          adj->ref_nr-1);
5212                 adj->ref_nr--;
5213                 return;
5214         }
5215
5216         if (adj->master)
5217                 sysfs_remove_link(&(dev->dev.kobj), "master");
5218
5219         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5220                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5221
5222         list_del_rcu(&adj->list);
5223         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5224                  adj_dev->name, dev->name, adj_dev->name);
5225         dev_put(adj_dev);
5226         kfree_rcu(adj, rcu);
5227 }
5228
5229 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5230                                             struct net_device *upper_dev,
5231                                             struct list_head *up_list,
5232                                             struct list_head *down_list,
5233                                             void *private, bool master)
5234 {
5235         int ret;
5236
5237         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5238                                            master);
5239         if (ret)
5240                 return ret;
5241
5242         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5243                                            false);
5244         if (ret) {
5245                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5246                 return ret;
5247         }
5248
5249         return 0;
5250 }
5251
5252 static int __netdev_adjacent_dev_link(struct net_device *dev,
5253                                       struct net_device *upper_dev)
5254 {
5255         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5256                                                 &dev->all_adj_list.upper,
5257                                                 &upper_dev->all_adj_list.lower,
5258                                                 NULL, false);
5259 }
5260
5261 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5262                                                struct net_device *upper_dev,
5263                                                struct list_head *up_list,
5264                                                struct list_head *down_list)
5265 {
5266         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5267         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5268 }
5269
5270 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5271                                          struct net_device *upper_dev)
5272 {
5273         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5274                                            &dev->all_adj_list.upper,
5275                                            &upper_dev->all_adj_list.lower);
5276 }
5277
5278 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5279                                                 struct net_device *upper_dev,
5280                                                 void *private, bool master)
5281 {
5282         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5283
5284         if (ret)
5285                 return ret;
5286
5287         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5288                                                &dev->adj_list.upper,
5289                                                &upper_dev->adj_list.lower,
5290                                                private, master);
5291         if (ret) {
5292                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5293                 return ret;
5294         }
5295
5296         return 0;
5297 }
5298
5299 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5300                                                    struct net_device *upper_dev)
5301 {
5302         __netdev_adjacent_dev_unlink(dev, upper_dev);
5303         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5304                                            &dev->adj_list.upper,
5305                                            &upper_dev->adj_list.lower);
5306 }
5307
5308 static int __netdev_upper_dev_link(struct net_device *dev,
5309                                    struct net_device *upper_dev, bool master,
5310                                    void *private)
5311 {
5312         struct netdev_adjacent *i, *j, *to_i, *to_j;
5313         int ret = 0;
5314
5315         ASSERT_RTNL();
5316
5317         if (dev == upper_dev)
5318                 return -EBUSY;
5319
5320         /* To prevent loops, check if dev is not upper device to upper_dev. */
5321         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5322                 return -EBUSY;
5323
5324         if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5325                 return -EEXIST;
5326
5327         if (master && netdev_master_upper_dev_get(dev))
5328                 return -EBUSY;
5329
5330         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5331                                                    master);
5332         if (ret)
5333                 return ret;
5334
5335         /* Now that we linked these devs, make all the upper_dev's
5336          * all_adj_list.upper visible to every dev's all_adj_list.lower an
5337          * versa, and don't forget the devices itself. All of these
5338          * links are non-neighbours.
5339          */
5340         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5341                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5342                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5343                                  i->dev->name, j->dev->name);
5344                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5345                         if (ret)
5346                                 goto rollback_mesh;
5347                 }
5348         }
5349
5350         /* add dev to every upper_dev's upper device */
5351         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5352                 pr_debug("linking %s's upper device %s with %s\n",
5353                          upper_dev->name, i->dev->name, dev->name);
5354                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5355                 if (ret)
5356                         goto rollback_upper_mesh;
5357         }
5358
5359         /* add upper_dev to every dev's lower device */
5360         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5361                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5362                          i->dev->name, upper_dev->name);
5363                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5364                 if (ret)
5365                         goto rollback_lower_mesh;
5366         }
5367
5368         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5369         return 0;
5370
5371 rollback_lower_mesh:
5372         to_i = i;
5373         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5374                 if (i == to_i)
5375                         break;
5376                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5377         }
5378
5379         i = NULL;
5380
5381 rollback_upper_mesh:
5382         to_i = i;
5383         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5384                 if (i == to_i)
5385                         break;
5386                 __netdev_adjacent_dev_unlink(dev, i->dev);
5387         }
5388
5389         i = j = NULL;
5390
5391 rollback_mesh:
5392         to_i = i;
5393         to_j = j;
5394         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5395                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5396                         if (i == to_i && j == to_j)
5397                                 break;
5398                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5399                 }
5400                 if (i == to_i)
5401                         break;
5402         }
5403
5404         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5405
5406         return ret;
5407 }
5408
5409 /**
5410  * netdev_upper_dev_link - Add a link to the upper device
5411  * @dev: device
5412  * @upper_dev: new upper device
5413  *
5414  * Adds a link to device which is upper to this one. The caller must hold
5415  * the RTNL lock. On a failure a negative errno code is returned.
5416  * On success the reference counts are adjusted and the function
5417  * returns zero.
5418  */
5419 int netdev_upper_dev_link(struct net_device *dev,
5420                           struct net_device *upper_dev)
5421 {
5422         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5423 }
5424 EXPORT_SYMBOL(netdev_upper_dev_link);
5425
5426 /**
5427  * netdev_master_upper_dev_link - Add a master link to the upper device
5428  * @dev: device
5429  * @upper_dev: new upper device
5430  *
5431  * Adds a link to device which is upper to this one. In this case, only
5432  * one master upper device can be linked, although other non-master devices
5433  * might be linked as well. The caller must hold the RTNL lock.
5434  * On a failure a negative errno code is returned. On success the reference
5435  * counts are adjusted and the function returns zero.
5436  */
5437 int netdev_master_upper_dev_link(struct net_device *dev,
5438                                  struct net_device *upper_dev)
5439 {
5440         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5441 }
5442 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5443
5444 int netdev_master_upper_dev_link_private(struct net_device *dev,
5445                                          struct net_device *upper_dev,
5446                                          void *private)
5447 {
5448         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5449 }
5450 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5451
5452 /**
5453  * netdev_upper_dev_unlink - Removes a link to upper device
5454  * @dev: device
5455  * @upper_dev: new upper device
5456  *
5457  * Removes a link to device which is upper to this one. The caller must hold
5458  * the RTNL lock.
5459  */
5460 void netdev_upper_dev_unlink(struct net_device *dev,
5461                              struct net_device *upper_dev)
5462 {
5463         struct netdev_adjacent *i, *j;
5464         ASSERT_RTNL();
5465
5466         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5467
5468         /* Here is the tricky part. We must remove all dev's lower
5469          * devices from all upper_dev's upper devices and vice
5470          * versa, to maintain the graph relationship.
5471          */
5472         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5473                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5474                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5475
5476         /* remove also the devices itself from lower/upper device
5477          * list
5478          */
5479         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5480                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5481
5482         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5483                 __netdev_adjacent_dev_unlink(dev, i->dev);
5484
5485         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5486 }
5487 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5488
5489 /**
5490  * netdev_bonding_info_change - Dispatch event about slave change
5491  * @dev: device
5492  * @bonding_info: info to dispatch
5493  *
5494  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5495  * The caller must hold the RTNL lock.
5496  */
5497 void netdev_bonding_info_change(struct net_device *dev,
5498                                 struct netdev_bonding_info *bonding_info)
5499 {
5500         struct netdev_notifier_bonding_info     info;
5501
5502         memcpy(&info.bonding_info, bonding_info,
5503                sizeof(struct netdev_bonding_info));
5504         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5505                                       &info.info);
5506 }
5507 EXPORT_SYMBOL(netdev_bonding_info_change);
5508
5509 static void netdev_adjacent_add_links(struct net_device *dev)
5510 {
5511         struct netdev_adjacent *iter;
5512
5513         struct net *net = dev_net(dev);
5514
5515         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5516                 if (!net_eq(net,dev_net(iter->dev)))
5517                         continue;
5518                 netdev_adjacent_sysfs_add(iter->dev, dev,
5519                                           &iter->dev->adj_list.lower);
5520                 netdev_adjacent_sysfs_add(dev, iter->dev,
5521                                           &dev->adj_list.upper);
5522         }
5523
5524         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5525                 if (!net_eq(net,dev_net(iter->dev)))
5526                         continue;
5527                 netdev_adjacent_sysfs_add(iter->dev, dev,
5528                                           &iter->dev->adj_list.upper);
5529                 netdev_adjacent_sysfs_add(dev, iter->dev,
5530                                           &dev->adj_list.lower);
5531         }
5532 }
5533
5534 static void netdev_adjacent_del_links(struct net_device *dev)
5535 {
5536         struct netdev_adjacent *iter;
5537
5538         struct net *net = dev_net(dev);
5539
5540         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5541                 if (!net_eq(net,dev_net(iter->dev)))
5542                         continue;
5543                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5544                                           &iter->dev->adj_list.lower);
5545                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5546                                           &dev->adj_list.upper);
5547         }
5548
5549         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5550                 if (!net_eq(net,dev_net(iter->dev)))
5551                         continue;
5552                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5553                                           &iter->dev->adj_list.upper);
5554                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5555                                           &dev->adj_list.lower);
5556         }
5557 }
5558
5559 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5560 {
5561         struct netdev_adjacent *iter;
5562
5563         struct net *net = dev_net(dev);
5564
5565         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5566                 if (!net_eq(net,dev_net(iter->dev)))
5567                         continue;
5568                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5569                                           &iter->dev->adj_list.lower);
5570                 netdev_adjacent_sysfs_add(iter->dev, dev,
5571                                           &iter->dev->adj_list.lower);
5572         }
5573
5574         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5575                 if (!net_eq(net,dev_net(iter->dev)))
5576                         continue;
5577                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5578                                           &iter->dev->adj_list.upper);
5579                 netdev_adjacent_sysfs_add(iter->dev, dev,
5580                                           &iter->dev->adj_list.upper);
5581         }
5582 }
5583
5584 void *netdev_lower_dev_get_private(struct net_device *dev,
5585                                    struct net_device *lower_dev)
5586 {
5587         struct netdev_adjacent *lower;
5588
5589         if (!lower_dev)
5590                 return NULL;
5591         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5592         if (!lower)
5593                 return NULL;
5594
5595         return lower->private;
5596 }
5597 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5598
5599
5600 int dev_get_nest_level(struct net_device *dev,
5601                        bool (*type_check)(struct net_device *dev))
5602 {
5603         struct net_device *lower = NULL;
5604         struct list_head *iter;
5605         int max_nest = -1;
5606         int nest;
5607
5608         ASSERT_RTNL();
5609
5610         netdev_for_each_lower_dev(dev, lower, iter) {
5611                 nest = dev_get_nest_level(lower, type_check);
5612                 if (max_nest < nest)
5613                         max_nest = nest;
5614         }
5615
5616         if (type_check(dev))
5617                 max_nest++;
5618
5619         return max_nest;
5620 }
5621 EXPORT_SYMBOL(dev_get_nest_level);
5622
5623 static void dev_change_rx_flags(struct net_device *dev, int flags)
5624 {
5625         const struct net_device_ops *ops = dev->netdev_ops;
5626
5627         if (ops->ndo_change_rx_flags)
5628                 ops->ndo_change_rx_flags(dev, flags);
5629 }
5630
5631 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5632 {
5633         unsigned int old_flags = dev->flags;
5634         kuid_t uid;
5635         kgid_t gid;
5636
5637         ASSERT_RTNL();
5638
5639         dev->flags |= IFF_PROMISC;
5640         dev->promiscuity += inc;
5641         if (dev->promiscuity == 0) {
5642                 /*
5643                  * Avoid overflow.
5644                  * If inc causes overflow, untouch promisc and return error.
5645                  */
5646                 if (inc < 0)
5647                         dev->flags &= ~IFF_PROMISC;
5648                 else {
5649                         dev->promiscuity -= inc;
5650                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5651                                 dev->name);
5652                         return -EOVERFLOW;
5653                 }
5654         }
5655         if (dev->flags != old_flags) {
5656                 pr_info("device %s %s promiscuous mode\n",
5657                         dev->name,
5658                         dev->flags & IFF_PROMISC ? "entered" : "left");
5659                 if (audit_enabled) {
5660                         current_uid_gid(&uid, &gid);
5661                         audit_log(current->audit_context, GFP_ATOMIC,
5662                                 AUDIT_ANOM_PROMISCUOUS,
5663                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5664                                 dev->name, (dev->flags & IFF_PROMISC),
5665                                 (old_flags & IFF_PROMISC),
5666                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5667                                 from_kuid(&init_user_ns, uid),
5668                                 from_kgid(&init_user_ns, gid),
5669                                 audit_get_sessionid(current));
5670                 }
5671
5672                 dev_change_rx_flags(dev, IFF_PROMISC);
5673         }
5674         if (notify)
5675                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5676         return 0;
5677 }
5678
5679 /**
5680  *      dev_set_promiscuity     - update promiscuity count on a device
5681  *      @dev: device
5682  *      @inc: modifier
5683  *
5684  *      Add or remove promiscuity from a device. While the count in the device
5685  *      remains above zero the interface remains promiscuous. Once it hits zero
5686  *      the device reverts back to normal filtering operation. A negative inc
5687  *      value is used to drop promiscuity on the device.
5688  *      Return 0 if successful or a negative errno code on error.
5689  */
5690 int dev_set_promiscuity(struct net_device *dev, int inc)
5691 {
5692         unsigned int old_flags = dev->flags;
5693         int err;
5694
5695         err = __dev_set_promiscuity(dev, inc, true);
5696         if (err < 0)
5697                 return err;
5698         if (dev->flags != old_flags)
5699                 dev_set_rx_mode(dev);
5700         return err;
5701 }
5702 EXPORT_SYMBOL(dev_set_promiscuity);
5703
5704 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5705 {
5706         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5707
5708         ASSERT_RTNL();
5709
5710         dev->flags |= IFF_ALLMULTI;
5711         dev->allmulti += inc;
5712         if (dev->allmulti == 0) {
5713                 /*
5714                  * Avoid overflow.
5715                  * If inc causes overflow, untouch allmulti and return error.
5716                  */
5717                 if (inc < 0)
5718                         dev->flags &= ~IFF_ALLMULTI;
5719                 else {
5720                         dev->allmulti -= inc;
5721                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5722                                 dev->name);
5723                         return -EOVERFLOW;
5724                 }
5725         }
5726         if (dev->flags ^ old_flags) {
5727                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5728                 dev_set_rx_mode(dev);
5729                 if (notify)
5730                         __dev_notify_flags(dev, old_flags,
5731                                            dev->gflags ^ old_gflags);
5732         }
5733         return 0;
5734 }
5735
5736 /**
5737  *      dev_set_allmulti        - update allmulti count on a device
5738  *      @dev: device
5739  *      @inc: modifier
5740  *
5741  *      Add or remove reception of all multicast frames to a device. While the
5742  *      count in the device remains above zero the interface remains listening
5743  *      to all interfaces. Once it hits zero the device reverts back to normal
5744  *      filtering operation. A negative @inc value is used to drop the counter
5745  *      when releasing a resource needing all multicasts.
5746  *      Return 0 if successful or a negative errno code on error.
5747  */
5748
5749 int dev_set_allmulti(struct net_device *dev, int inc)
5750 {
5751         return __dev_set_allmulti(dev, inc, true);
5752 }
5753 EXPORT_SYMBOL(dev_set_allmulti);
5754
5755 /*
5756  *      Upload unicast and multicast address lists to device and
5757  *      configure RX filtering. When the device doesn't support unicast
5758  *      filtering it is put in promiscuous mode while unicast addresses
5759  *      are present.
5760  */
5761 void __dev_set_rx_mode(struct net_device *dev)
5762 {
5763         const struct net_device_ops *ops = dev->netdev_ops;
5764
5765         /* dev_open will call this function so the list will stay sane. */
5766         if (!(dev->flags&IFF_UP))
5767                 return;
5768
5769         if (!netif_device_present(dev))
5770                 return;
5771
5772         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5773                 /* Unicast addresses changes may only happen under the rtnl,
5774                  * therefore calling __dev_set_promiscuity here is safe.
5775                  */
5776                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5777                         __dev_set_promiscuity(dev, 1, false);
5778                         dev->uc_promisc = true;
5779                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5780                         __dev_set_promiscuity(dev, -1, false);
5781                         dev->uc_promisc = false;
5782                 }
5783         }
5784
5785         if (ops->ndo_set_rx_mode)
5786                 ops->ndo_set_rx_mode(dev);
5787 }
5788
5789 void dev_set_rx_mode(struct net_device *dev)
5790 {
5791         netif_addr_lock_bh(dev);
5792         __dev_set_rx_mode(dev);
5793         netif_addr_unlock_bh(dev);
5794 }
5795
5796 /**
5797  *      dev_get_flags - get flags reported to userspace
5798  *      @dev: device
5799  *
5800  *      Get the combination of flag bits exported through APIs to userspace.
5801  */
5802 unsigned int dev_get_flags(const struct net_device *dev)
5803 {
5804         unsigned int flags;
5805
5806         flags = (dev->flags & ~(IFF_PROMISC |
5807                                 IFF_ALLMULTI |
5808                                 IFF_RUNNING |
5809                                 IFF_LOWER_UP |
5810                                 IFF_DORMANT)) |
5811                 (dev->gflags & (IFF_PROMISC |
5812                                 IFF_ALLMULTI));
5813
5814         if (netif_running(dev)) {
5815                 if (netif_oper_up(dev))
5816                         flags |= IFF_RUNNING;
5817                 if (netif_carrier_ok(dev))
5818                         flags |= IFF_LOWER_UP;
5819                 if (netif_dormant(dev))
5820                         flags |= IFF_DORMANT;
5821         }
5822
5823         return flags;
5824 }
5825 EXPORT_SYMBOL(dev_get_flags);
5826
5827 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5828 {
5829         unsigned int old_flags = dev->flags;
5830         int ret;
5831
5832         ASSERT_RTNL();
5833
5834         /*
5835          *      Set the flags on our device.
5836          */
5837
5838         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5839                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5840                                IFF_AUTOMEDIA)) |
5841                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5842                                     IFF_ALLMULTI));
5843
5844         /*
5845          *      Load in the correct multicast list now the flags have changed.
5846          */
5847
5848         if ((old_flags ^ flags) & IFF_MULTICAST)
5849                 dev_change_rx_flags(dev, IFF_MULTICAST);
5850
5851         dev_set_rx_mode(dev);
5852
5853         /*
5854          *      Have we downed the interface. We handle IFF_UP ourselves
5855          *      according to user attempts to set it, rather than blindly
5856          *      setting it.
5857          */
5858
5859         ret = 0;
5860         if ((old_flags ^ flags) & IFF_UP)
5861                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5862
5863         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5864                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5865                 unsigned int old_flags = dev->flags;
5866
5867                 dev->gflags ^= IFF_PROMISC;
5868
5869                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5870                         if (dev->flags != old_flags)
5871                                 dev_set_rx_mode(dev);
5872         }
5873
5874         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5875            is important. Some (broken) drivers set IFF_PROMISC, when
5876            IFF_ALLMULTI is requested not asking us and not reporting.
5877          */
5878         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5879                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5880
5881                 dev->gflags ^= IFF_ALLMULTI;
5882                 __dev_set_allmulti(dev, inc, false);
5883         }
5884
5885         return ret;
5886 }
5887
5888 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5889                         unsigned int gchanges)
5890 {
5891         unsigned int changes = dev->flags ^ old_flags;
5892
5893         if (gchanges)
5894                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5895
5896         if (changes & IFF_UP) {
5897                 if (dev->flags & IFF_UP)
5898                         call_netdevice_notifiers(NETDEV_UP, dev);
5899                 else
5900                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5901         }
5902
5903         if (dev->flags & IFF_UP &&
5904             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5905                 struct netdev_notifier_change_info change_info;
5906
5907                 change_info.flags_changed = changes;
5908                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5909                                               &change_info.info);
5910         }
5911 }
5912
5913 /**
5914  *      dev_change_flags - change device settings
5915  *      @dev: device
5916  *      @flags: device state flags
5917  *
5918  *      Change settings on device based state flags. The flags are
5919  *      in the userspace exported format.
5920  */
5921 int dev_change_flags(struct net_device *dev, unsigned int flags)
5922 {
5923         int ret;
5924         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5925
5926         ret = __dev_change_flags(dev, flags);
5927         if (ret < 0)
5928                 return ret;
5929
5930         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5931         __dev_notify_flags(dev, old_flags, changes);
5932         return ret;
5933 }
5934 EXPORT_SYMBOL(dev_change_flags);
5935
5936 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5937 {
5938         const struct net_device_ops *ops = dev->netdev_ops;
5939
5940         if (ops->ndo_change_mtu)
5941                 return ops->ndo_change_mtu(dev, new_mtu);
5942
5943         dev->mtu = new_mtu;
5944         return 0;
5945 }
5946
5947 /**
5948  *      dev_set_mtu - Change maximum transfer unit
5949  *      @dev: device
5950  *      @new_mtu: new transfer unit
5951  *
5952  *      Change the maximum transfer size of the network device.
5953  */
5954 int dev_set_mtu(struct net_device *dev, int new_mtu)
5955 {
5956         int err, orig_mtu;
5957
5958         if (new_mtu == dev->mtu)
5959                 return 0;
5960
5961         /*      MTU must be positive.    */
5962         if (new_mtu < 0)
5963                 return -EINVAL;
5964
5965         if (!netif_device_present(dev))
5966                 return -ENODEV;
5967
5968         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5969         err = notifier_to_errno(err);
5970         if (err)
5971                 return err;
5972
5973         orig_mtu = dev->mtu;
5974         err = __dev_set_mtu(dev, new_mtu);
5975
5976         if (!err) {
5977                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5978                 err = notifier_to_errno(err);
5979                 if (err) {
5980                         /* setting mtu back and notifying everyone again,
5981                          * so that they have a chance to revert changes.
5982                          */
5983                         __dev_set_mtu(dev, orig_mtu);
5984                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5985                 }
5986         }
5987         return err;
5988 }
5989 EXPORT_SYMBOL(dev_set_mtu);
5990
5991 /**
5992  *      dev_set_group - Change group this device belongs to
5993  *      @dev: device
5994  *      @new_group: group this device should belong to
5995  */
5996 void dev_set_group(struct net_device *dev, int new_group)
5997 {
5998         dev->group = new_group;
5999 }
6000 EXPORT_SYMBOL(dev_set_group);
6001
6002 /**
6003  *      dev_set_mac_address - Change Media Access Control Address
6004  *      @dev: device
6005  *      @sa: new address
6006  *
6007  *      Change the hardware (MAC) address of the device
6008  */
6009 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6010 {
6011         const struct net_device_ops *ops = dev->netdev_ops;
6012         int err;
6013
6014         if (!ops->ndo_set_mac_address)
6015                 return -EOPNOTSUPP;
6016         if (sa->sa_family != dev->type)
6017                 return -EINVAL;
6018         if (!netif_device_present(dev))
6019                 return -ENODEV;
6020         err = ops->ndo_set_mac_address(dev, sa);
6021         if (err)
6022                 return err;
6023         dev->addr_assign_type = NET_ADDR_SET;
6024         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6025         add_device_randomness(dev->dev_addr, dev->addr_len);
6026         return 0;
6027 }
6028 EXPORT_SYMBOL(dev_set_mac_address);
6029
6030 /**
6031  *      dev_change_carrier - Change device carrier
6032  *      @dev: device
6033  *      @new_carrier: new value
6034  *
6035  *      Change device carrier
6036  */
6037 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6038 {
6039         const struct net_device_ops *ops = dev->netdev_ops;
6040
6041         if (!ops->ndo_change_carrier)
6042                 return -EOPNOTSUPP;
6043         if (!netif_device_present(dev))
6044                 return -ENODEV;
6045         return ops->ndo_change_carrier(dev, new_carrier);
6046 }
6047 EXPORT_SYMBOL(dev_change_carrier);
6048
6049 /**
6050  *      dev_get_phys_port_id - Get device physical port ID
6051  *      @dev: device
6052  *      @ppid: port ID
6053  *
6054  *      Get device physical port ID
6055  */
6056 int dev_get_phys_port_id(struct net_device *dev,
6057                          struct netdev_phys_item_id *ppid)
6058 {
6059         const struct net_device_ops *ops = dev->netdev_ops;
6060
6061         if (!ops->ndo_get_phys_port_id)
6062                 return -EOPNOTSUPP;
6063         return ops->ndo_get_phys_port_id(dev, ppid);
6064 }
6065 EXPORT_SYMBOL(dev_get_phys_port_id);
6066
6067 /**
6068  *      dev_get_phys_port_name - Get device physical port name
6069  *      @dev: device
6070  *      @name: port name
6071  *
6072  *      Get device physical port name
6073  */
6074 int dev_get_phys_port_name(struct net_device *dev,
6075                            char *name, size_t len)
6076 {
6077         const struct net_device_ops *ops = dev->netdev_ops;
6078
6079         if (!ops->ndo_get_phys_port_name)
6080                 return -EOPNOTSUPP;
6081         return ops->ndo_get_phys_port_name(dev, name, len);
6082 }
6083 EXPORT_SYMBOL(dev_get_phys_port_name);
6084
6085 /**
6086  *      dev_new_index   -       allocate an ifindex
6087  *      @net: the applicable net namespace
6088  *
6089  *      Returns a suitable unique value for a new device interface
6090  *      number.  The caller must hold the rtnl semaphore or the
6091  *      dev_base_lock to be sure it remains unique.
6092  */
6093 static int dev_new_index(struct net *net)
6094 {
6095         int ifindex = net->ifindex;
6096         for (;;) {
6097                 if (++ifindex <= 0)
6098                         ifindex = 1;
6099                 if (!__dev_get_by_index(net, ifindex))
6100                         return net->ifindex = ifindex;
6101         }
6102 }
6103
6104 /* Delayed registration/unregisteration */
6105 static LIST_HEAD(net_todo_list);
6106 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6107
6108 static void net_set_todo(struct net_device *dev)
6109 {
6110         list_add_tail(&dev->todo_list, &net_todo_list);
6111         dev_net(dev)->dev_unreg_count++;
6112 }
6113
6114 static void rollback_registered_many(struct list_head *head)
6115 {
6116         struct net_device *dev, *tmp;
6117         LIST_HEAD(close_head);
6118
6119         BUG_ON(dev_boot_phase);
6120         ASSERT_RTNL();
6121
6122         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6123                 /* Some devices call without registering
6124                  * for initialization unwind. Remove those
6125                  * devices and proceed with the remaining.
6126                  */
6127                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6128                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6129                                  dev->name, dev);
6130
6131                         WARN_ON(1);
6132                         list_del(&dev->unreg_list);
6133                         continue;
6134                 }
6135                 dev->dismantle = true;
6136                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6137         }
6138
6139         /* If device is running, close it first. */
6140         list_for_each_entry(dev, head, unreg_list)
6141                 list_add_tail(&dev->close_list, &close_head);
6142         dev_close_many(&close_head, true);
6143
6144         list_for_each_entry(dev, head, unreg_list) {
6145                 /* And unlink it from device chain. */
6146                 unlist_netdevice(dev);
6147
6148                 dev->reg_state = NETREG_UNREGISTERING;
6149         }
6150
6151         synchronize_net();
6152
6153         list_for_each_entry(dev, head, unreg_list) {
6154                 struct sk_buff *skb = NULL;
6155
6156                 /* Shutdown queueing discipline. */
6157                 dev_shutdown(dev);
6158
6159
6160                 /* Notify protocols, that we are about to destroy
6161                    this device. They should clean all the things.
6162                 */
6163                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6164
6165                 if (!dev->rtnl_link_ops ||
6166                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6167                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6168                                                      GFP_KERNEL);
6169
6170                 /*
6171                  *      Flush the unicast and multicast chains
6172                  */
6173                 dev_uc_flush(dev);
6174                 dev_mc_flush(dev);
6175
6176                 if (dev->netdev_ops->ndo_uninit)
6177                         dev->netdev_ops->ndo_uninit(dev);
6178
6179                 if (skb)
6180                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6181
6182                 /* Notifier chain MUST detach us all upper devices. */
6183                 WARN_ON(netdev_has_any_upper_dev(dev));
6184
6185                 /* Remove entries from kobject tree */
6186                 netdev_unregister_kobject(dev);
6187 #ifdef CONFIG_XPS
6188                 /* Remove XPS queueing entries */
6189                 netif_reset_xps_queues_gt(dev, 0);
6190 #endif
6191         }
6192
6193         synchronize_net();
6194
6195         list_for_each_entry(dev, head, unreg_list)
6196                 dev_put(dev);
6197 }
6198
6199 static void rollback_registered(struct net_device *dev)
6200 {
6201         LIST_HEAD(single);
6202
6203         list_add(&dev->unreg_list, &single);
6204         rollback_registered_many(&single);
6205         list_del(&single);
6206 }
6207
6208 static netdev_features_t netdev_fix_features(struct net_device *dev,
6209         netdev_features_t features)
6210 {
6211         /* Fix illegal checksum combinations */
6212         if ((features & NETIF_F_HW_CSUM) &&
6213             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6214                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6215                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6216         }
6217
6218         /* TSO requires that SG is present as well. */
6219         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6220                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6221                 features &= ~NETIF_F_ALL_TSO;
6222         }
6223
6224         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6225                                         !(features & NETIF_F_IP_CSUM)) {
6226                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6227                 features &= ~NETIF_F_TSO;
6228                 features &= ~NETIF_F_TSO_ECN;
6229         }
6230
6231         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6232                                          !(features & NETIF_F_IPV6_CSUM)) {
6233                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6234                 features &= ~NETIF_F_TSO6;
6235         }
6236
6237         /* TSO ECN requires that TSO is present as well. */
6238         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6239                 features &= ~NETIF_F_TSO_ECN;
6240
6241         /* Software GSO depends on SG. */
6242         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6243                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6244                 features &= ~NETIF_F_GSO;
6245         }
6246
6247         /* UFO needs SG and checksumming */
6248         if (features & NETIF_F_UFO) {
6249                 /* maybe split UFO into V4 and V6? */
6250                 if (!((features & NETIF_F_GEN_CSUM) ||
6251                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6252                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6253                         netdev_dbg(dev,
6254                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6255                         features &= ~NETIF_F_UFO;
6256                 }
6257
6258                 if (!(features & NETIF_F_SG)) {
6259                         netdev_dbg(dev,
6260                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6261                         features &= ~NETIF_F_UFO;
6262                 }
6263         }
6264
6265 #ifdef CONFIG_NET_RX_BUSY_POLL
6266         if (dev->netdev_ops->ndo_busy_poll)
6267                 features |= NETIF_F_BUSY_POLL;
6268         else
6269 #endif
6270                 features &= ~NETIF_F_BUSY_POLL;
6271
6272         return features;
6273 }
6274
6275 int __netdev_update_features(struct net_device *dev)
6276 {
6277         netdev_features_t features;
6278         int err = 0;
6279
6280         ASSERT_RTNL();
6281
6282         features = netdev_get_wanted_features(dev);
6283
6284         if (dev->netdev_ops->ndo_fix_features)
6285                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6286
6287         /* driver might be less strict about feature dependencies */
6288         features = netdev_fix_features(dev, features);
6289
6290         if (dev->features == features)
6291                 return 0;
6292
6293         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6294                 &dev->features, &features);
6295
6296         if (dev->netdev_ops->ndo_set_features)
6297                 err = dev->netdev_ops->ndo_set_features(dev, features);
6298
6299         if (unlikely(err < 0)) {
6300                 netdev_err(dev,
6301                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6302                         err, &features, &dev->features);
6303                 return -1;
6304         }
6305
6306         if (!err)
6307                 dev->features = features;
6308
6309         return 1;
6310 }
6311
6312 /**
6313  *      netdev_update_features - recalculate device features
6314  *      @dev: the device to check
6315  *
6316  *      Recalculate dev->features set and send notifications if it
6317  *      has changed. Should be called after driver or hardware dependent
6318  *      conditions might have changed that influence the features.
6319  */
6320 void netdev_update_features(struct net_device *dev)
6321 {
6322         if (__netdev_update_features(dev))
6323                 netdev_features_change(dev);
6324 }
6325 EXPORT_SYMBOL(netdev_update_features);
6326
6327 /**
6328  *      netdev_change_features - recalculate device features
6329  *      @dev: the device to check
6330  *
6331  *      Recalculate dev->features set and send notifications even
6332  *      if they have not changed. Should be called instead of
6333  *      netdev_update_features() if also dev->vlan_features might
6334  *      have changed to allow the changes to be propagated to stacked
6335  *      VLAN devices.
6336  */
6337 void netdev_change_features(struct net_device *dev)
6338 {
6339         __netdev_update_features(dev);
6340         netdev_features_change(dev);
6341 }
6342 EXPORT_SYMBOL(netdev_change_features);
6343
6344 /**
6345  *      netif_stacked_transfer_operstate -      transfer operstate
6346  *      @rootdev: the root or lower level device to transfer state from
6347  *      @dev: the device to transfer operstate to
6348  *
6349  *      Transfer operational state from root to device. This is normally
6350  *      called when a stacking relationship exists between the root
6351  *      device and the device(a leaf device).
6352  */
6353 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6354                                         struct net_device *dev)
6355 {
6356         if (rootdev->operstate == IF_OPER_DORMANT)
6357                 netif_dormant_on(dev);
6358         else
6359                 netif_dormant_off(dev);
6360
6361         if (netif_carrier_ok(rootdev)) {
6362                 if (!netif_carrier_ok(dev))
6363                         netif_carrier_on(dev);
6364         } else {
6365                 if (netif_carrier_ok(dev))
6366                         netif_carrier_off(dev);
6367         }
6368 }
6369 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6370
6371 #ifdef CONFIG_SYSFS
6372 static int netif_alloc_rx_queues(struct net_device *dev)
6373 {
6374         unsigned int i, count = dev->num_rx_queues;
6375         struct netdev_rx_queue *rx;
6376         size_t sz = count * sizeof(*rx);
6377
6378         BUG_ON(count < 1);
6379
6380         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6381         if (!rx) {
6382                 rx = vzalloc(sz);
6383                 if (!rx)
6384                         return -ENOMEM;
6385         }
6386         dev->_rx = rx;
6387
6388         for (i = 0; i < count; i++)
6389                 rx[i].dev = dev;
6390         return 0;
6391 }
6392 #endif
6393
6394 static void netdev_init_one_queue(struct net_device *dev,
6395                                   struct netdev_queue *queue, void *_unused)
6396 {
6397         /* Initialize queue lock */
6398         spin_lock_init(&queue->_xmit_lock);
6399         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6400         queue->xmit_lock_owner = -1;
6401         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6402         queue->dev = dev;
6403 #ifdef CONFIG_BQL
6404         dql_init(&queue->dql, HZ);
6405 #endif
6406 }
6407
6408 static void netif_free_tx_queues(struct net_device *dev)
6409 {
6410         kvfree(dev->_tx);
6411 }
6412
6413 static int netif_alloc_netdev_queues(struct net_device *dev)
6414 {
6415         unsigned int count = dev->num_tx_queues;
6416         struct netdev_queue *tx;
6417         size_t sz = count * sizeof(*tx);
6418
6419         BUG_ON(count < 1 || count > 0xffff);
6420
6421         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6422         if (!tx) {
6423                 tx = vzalloc(sz);
6424                 if (!tx)
6425                         return -ENOMEM;
6426         }
6427         dev->_tx = tx;
6428
6429         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6430         spin_lock_init(&dev->tx_global_lock);
6431
6432         return 0;
6433 }
6434
6435 void netif_tx_stop_all_queues(struct net_device *dev)
6436 {
6437         unsigned int i;
6438
6439         for (i = 0; i < dev->num_tx_queues; i++) {
6440                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6441                 netif_tx_stop_queue(txq);
6442         }
6443 }
6444 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6445
6446 /**
6447  *      register_netdevice      - register a network device
6448  *      @dev: device to register
6449  *
6450  *      Take a completed network device structure and add it to the kernel
6451  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6452  *      chain. 0 is returned on success. A negative errno code is returned
6453  *      on a failure to set up the device, or if the name is a duplicate.
6454  *
6455  *      Callers must hold the rtnl semaphore. You may want
6456  *      register_netdev() instead of this.
6457  *
6458  *      BUGS:
6459  *      The locking appears insufficient to guarantee two parallel registers
6460  *      will not get the same name.
6461  */
6462
6463 int register_netdevice(struct net_device *dev)
6464 {
6465         int ret;
6466         struct net *net = dev_net(dev);
6467
6468         BUG_ON(dev_boot_phase);
6469         ASSERT_RTNL();
6470
6471         might_sleep();
6472
6473         /* When net_device's are persistent, this will be fatal. */
6474         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6475         BUG_ON(!net);
6476
6477         spin_lock_init(&dev->addr_list_lock);
6478         netdev_set_addr_lockdep_class(dev);
6479
6480         ret = dev_get_valid_name(net, dev, dev->name);
6481         if (ret < 0)
6482                 goto out;
6483
6484         /* Init, if this function is available */
6485         if (dev->netdev_ops->ndo_init) {
6486                 ret = dev->netdev_ops->ndo_init(dev);
6487                 if (ret) {
6488                         if (ret > 0)
6489                                 ret = -EIO;
6490                         goto out;
6491                 }
6492         }
6493
6494         if (((dev->hw_features | dev->features) &
6495              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6496             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6497              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6498                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6499                 ret = -EINVAL;
6500                 goto err_uninit;
6501         }
6502
6503         ret = -EBUSY;
6504         if (!dev->ifindex)
6505                 dev->ifindex = dev_new_index(net);
6506         else if (__dev_get_by_index(net, dev->ifindex))
6507                 goto err_uninit;
6508
6509         /* Transfer changeable features to wanted_features and enable
6510          * software offloads (GSO and GRO).
6511          */
6512         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6513         dev->features |= NETIF_F_SOFT_FEATURES;
6514         dev->wanted_features = dev->features & dev->hw_features;
6515
6516         if (!(dev->flags & IFF_LOOPBACK)) {
6517                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6518         }
6519
6520         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6521          */
6522         dev->vlan_features |= NETIF_F_HIGHDMA;
6523
6524         /* Make NETIF_F_SG inheritable to tunnel devices.
6525          */
6526         dev->hw_enc_features |= NETIF_F_SG;
6527
6528         /* Make NETIF_F_SG inheritable to MPLS.
6529          */
6530         dev->mpls_features |= NETIF_F_SG;
6531
6532         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6533         ret = notifier_to_errno(ret);
6534         if (ret)
6535                 goto err_uninit;
6536
6537         ret = netdev_register_kobject(dev);
6538         if (ret)
6539                 goto err_uninit;
6540         dev->reg_state = NETREG_REGISTERED;
6541
6542         __netdev_update_features(dev);
6543
6544         /*
6545          *      Default initial state at registry is that the
6546          *      device is present.
6547          */
6548
6549         set_bit(__LINK_STATE_PRESENT, &dev->state);
6550
6551         linkwatch_init_dev(dev);
6552
6553         dev_init_scheduler(dev);
6554         dev_hold(dev);
6555         list_netdevice(dev);
6556         add_device_randomness(dev->dev_addr, dev->addr_len);
6557
6558         /* If the device has permanent device address, driver should
6559          * set dev_addr and also addr_assign_type should be set to
6560          * NET_ADDR_PERM (default value).
6561          */
6562         if (dev->addr_assign_type == NET_ADDR_PERM)
6563                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6564
6565         /* Notify protocols, that a new device appeared. */
6566         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6567         ret = notifier_to_errno(ret);
6568         if (ret) {
6569                 rollback_registered(dev);
6570                 dev->reg_state = NETREG_UNREGISTERED;
6571         }
6572         /*
6573          *      Prevent userspace races by waiting until the network
6574          *      device is fully setup before sending notifications.
6575          */
6576         if (!dev->rtnl_link_ops ||
6577             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6578                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6579
6580 out:
6581         return ret;
6582
6583 err_uninit:
6584         if (dev->netdev_ops->ndo_uninit)
6585                 dev->netdev_ops->ndo_uninit(dev);
6586         goto out;
6587 }
6588 EXPORT_SYMBOL(register_netdevice);
6589
6590 /**
6591  *      init_dummy_netdev       - init a dummy network device for NAPI
6592  *      @dev: device to init
6593  *
6594  *      This takes a network device structure and initialize the minimum
6595  *      amount of fields so it can be used to schedule NAPI polls without
6596  *      registering a full blown interface. This is to be used by drivers
6597  *      that need to tie several hardware interfaces to a single NAPI
6598  *      poll scheduler due to HW limitations.
6599  */
6600 int init_dummy_netdev(struct net_device *dev)
6601 {
6602         /* Clear everything. Note we don't initialize spinlocks
6603          * are they aren't supposed to be taken by any of the
6604          * NAPI code and this dummy netdev is supposed to be
6605          * only ever used for NAPI polls
6606          */
6607         memset(dev, 0, sizeof(struct net_device));
6608
6609         /* make sure we BUG if trying to hit standard
6610          * register/unregister code path
6611          */
6612         dev->reg_state = NETREG_DUMMY;
6613
6614         /* NAPI wants this */
6615         INIT_LIST_HEAD(&dev->napi_list);
6616
6617         /* a dummy interface is started by default */
6618         set_bit(__LINK_STATE_PRESENT, &dev->state);
6619         set_bit(__LINK_STATE_START, &dev->state);
6620
6621         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6622          * because users of this 'device' dont need to change
6623          * its refcount.
6624          */
6625
6626         return 0;
6627 }
6628 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6629
6630
6631 /**
6632  *      register_netdev - register a network device
6633  *      @dev: device to register
6634  *
6635  *      Take a completed network device structure and add it to the kernel
6636  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6637  *      chain. 0 is returned on success. A negative errno code is returned
6638  *      on a failure to set up the device, or if the name is a duplicate.
6639  *
6640  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6641  *      and expands the device name if you passed a format string to
6642  *      alloc_netdev.
6643  */
6644 int register_netdev(struct net_device *dev)
6645 {
6646         int err;
6647
6648         rtnl_lock();
6649         err = register_netdevice(dev);
6650         rtnl_unlock();
6651         return err;
6652 }
6653 EXPORT_SYMBOL(register_netdev);
6654
6655 int netdev_refcnt_read(const struct net_device *dev)
6656 {
6657         int i, refcnt = 0;
6658
6659         for_each_possible_cpu(i)
6660                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6661         return refcnt;
6662 }
6663 EXPORT_SYMBOL(netdev_refcnt_read);
6664
6665 /**
6666  * netdev_wait_allrefs - wait until all references are gone.
6667  * @dev: target net_device
6668  *
6669  * This is called when unregistering network devices.
6670  *
6671  * Any protocol or device that holds a reference should register
6672  * for netdevice notification, and cleanup and put back the
6673  * reference if they receive an UNREGISTER event.
6674  * We can get stuck here if buggy protocols don't correctly
6675  * call dev_put.
6676  */
6677 static void netdev_wait_allrefs(struct net_device *dev)
6678 {
6679         unsigned long rebroadcast_time, warning_time;
6680         int refcnt;
6681
6682         linkwatch_forget_dev(dev);
6683
6684         rebroadcast_time = warning_time = jiffies;
6685         refcnt = netdev_refcnt_read(dev);
6686
6687         while (refcnt != 0) {
6688                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6689                         rtnl_lock();
6690
6691                         /* Rebroadcast unregister notification */
6692                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6693
6694                         __rtnl_unlock();
6695                         rcu_barrier();
6696                         rtnl_lock();
6697
6698                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6699                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6700                                      &dev->state)) {
6701                                 /* We must not have linkwatch events
6702                                  * pending on unregister. If this
6703                                  * happens, we simply run the queue
6704                                  * unscheduled, resulting in a noop
6705                                  * for this device.
6706                                  */
6707                                 linkwatch_run_queue();
6708                         }
6709
6710                         __rtnl_unlock();
6711
6712                         rebroadcast_time = jiffies;
6713                 }
6714
6715                 msleep(250);
6716
6717                 refcnt = netdev_refcnt_read(dev);
6718
6719                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6720                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6721                                  dev->name, refcnt);
6722                         warning_time = jiffies;
6723                 }
6724         }
6725 }
6726
6727 /* The sequence is:
6728  *
6729  *      rtnl_lock();
6730  *      ...
6731  *      register_netdevice(x1);
6732  *      register_netdevice(x2);
6733  *      ...
6734  *      unregister_netdevice(y1);
6735  *      unregister_netdevice(y2);
6736  *      ...
6737  *      rtnl_unlock();
6738  *      free_netdev(y1);
6739  *      free_netdev(y2);
6740  *
6741  * We are invoked by rtnl_unlock().
6742  * This allows us to deal with problems:
6743  * 1) We can delete sysfs objects which invoke hotplug
6744  *    without deadlocking with linkwatch via keventd.
6745  * 2) Since we run with the RTNL semaphore not held, we can sleep
6746  *    safely in order to wait for the netdev refcnt to drop to zero.
6747  *
6748  * We must not return until all unregister events added during
6749  * the interval the lock was held have been completed.
6750  */
6751 void netdev_run_todo(void)
6752 {
6753         struct list_head list;
6754
6755         /* Snapshot list, allow later requests */
6756         list_replace_init(&net_todo_list, &list);
6757
6758         __rtnl_unlock();
6759
6760
6761         /* Wait for rcu callbacks to finish before next phase */
6762         if (!list_empty(&list))
6763                 rcu_barrier();
6764
6765         while (!list_empty(&list)) {
6766                 struct net_device *dev
6767                         = list_first_entry(&list, struct net_device, todo_list);
6768                 list_del(&dev->todo_list);
6769
6770                 rtnl_lock();
6771                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6772                 __rtnl_unlock();
6773
6774                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6775                         pr_err("network todo '%s' but state %d\n",
6776                                dev->name, dev->reg_state);
6777                         dump_stack();
6778                         continue;
6779                 }
6780
6781                 dev->reg_state = NETREG_UNREGISTERED;
6782
6783                 on_each_cpu(flush_backlog, dev, 1);
6784
6785                 netdev_wait_allrefs(dev);
6786
6787                 /* paranoia */
6788                 BUG_ON(netdev_refcnt_read(dev));
6789                 BUG_ON(!list_empty(&dev->ptype_all));
6790                 BUG_ON(!list_empty(&dev->ptype_specific));
6791                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6792                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6793                 WARN_ON(dev->dn_ptr);
6794
6795                 if (dev->destructor)
6796                         dev->destructor(dev);
6797
6798                 /* Report a network device has been unregistered */
6799                 rtnl_lock();
6800                 dev_net(dev)->dev_unreg_count--;
6801                 __rtnl_unlock();
6802                 wake_up(&netdev_unregistering_wq);
6803
6804                 /* Free network device */
6805                 kobject_put(&dev->dev.kobj);
6806         }
6807 }
6808
6809 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6810  * fields in the same order, with only the type differing.
6811  */
6812 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6813                              const struct net_device_stats *netdev_stats)
6814 {
6815 #if BITS_PER_LONG == 64
6816         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6817         memcpy(stats64, netdev_stats, sizeof(*stats64));
6818 #else
6819         size_t i, n = sizeof(*stats64) / sizeof(u64);
6820         const unsigned long *src = (const unsigned long *)netdev_stats;
6821         u64 *dst = (u64 *)stats64;
6822
6823         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6824                      sizeof(*stats64) / sizeof(u64));
6825         for (i = 0; i < n; i++)
6826                 dst[i] = src[i];
6827 #endif
6828 }
6829 EXPORT_SYMBOL(netdev_stats_to_stats64);
6830
6831 /**
6832  *      dev_get_stats   - get network device statistics
6833  *      @dev: device to get statistics from
6834  *      @storage: place to store stats
6835  *
6836  *      Get network statistics from device. Return @storage.
6837  *      The device driver may provide its own method by setting
6838  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6839  *      otherwise the internal statistics structure is used.
6840  */
6841 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6842                                         struct rtnl_link_stats64 *storage)
6843 {
6844         const struct net_device_ops *ops = dev->netdev_ops;
6845
6846         if (ops->ndo_get_stats64) {
6847                 memset(storage, 0, sizeof(*storage));
6848                 ops->ndo_get_stats64(dev, storage);
6849         } else if (ops->ndo_get_stats) {
6850                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6851         } else {
6852                 netdev_stats_to_stats64(storage, &dev->stats);
6853         }
6854         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6855         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6856         return storage;
6857 }
6858 EXPORT_SYMBOL(dev_get_stats);
6859
6860 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6861 {
6862         struct netdev_queue *queue = dev_ingress_queue(dev);
6863
6864 #ifdef CONFIG_NET_CLS_ACT
6865         if (queue)
6866                 return queue;
6867         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6868         if (!queue)
6869                 return NULL;
6870         netdev_init_one_queue(dev, queue, NULL);
6871         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6872         queue->qdisc_sleeping = &noop_qdisc;
6873         rcu_assign_pointer(dev->ingress_queue, queue);
6874 #endif
6875         return queue;
6876 }
6877
6878 static const struct ethtool_ops default_ethtool_ops;
6879
6880 void netdev_set_default_ethtool_ops(struct net_device *dev,
6881                                     const struct ethtool_ops *ops)
6882 {
6883         if (dev->ethtool_ops == &default_ethtool_ops)
6884                 dev->ethtool_ops = ops;
6885 }
6886 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6887
6888 void netdev_freemem(struct net_device *dev)
6889 {
6890         char *addr = (char *)dev - dev->padded;
6891
6892         kvfree(addr);
6893 }
6894
6895 /**
6896  *      alloc_netdev_mqs - allocate network device
6897  *      @sizeof_priv:           size of private data to allocate space for
6898  *      @name:                  device name format string
6899  *      @name_assign_type:      origin of device name
6900  *      @setup:                 callback to initialize device
6901  *      @txqs:                  the number of TX subqueues to allocate
6902  *      @rxqs:                  the number of RX subqueues to allocate
6903  *
6904  *      Allocates a struct net_device with private data area for driver use
6905  *      and performs basic initialization.  Also allocates subqueue structs
6906  *      for each queue on the device.
6907  */
6908 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6909                 unsigned char name_assign_type,
6910                 void (*setup)(struct net_device *),
6911                 unsigned int txqs, unsigned int rxqs)
6912 {
6913         struct net_device *dev;
6914         size_t alloc_size;
6915         struct net_device *p;
6916
6917         BUG_ON(strlen(name) >= sizeof(dev->name));
6918
6919         if (txqs < 1) {
6920                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6921                 return NULL;
6922         }
6923
6924 #ifdef CONFIG_SYSFS
6925         if (rxqs < 1) {
6926                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6927                 return NULL;
6928         }
6929 #endif
6930
6931         alloc_size = sizeof(struct net_device);
6932         if (sizeof_priv) {
6933                 /* ensure 32-byte alignment of private area */
6934                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6935                 alloc_size += sizeof_priv;
6936         }
6937         /* ensure 32-byte alignment of whole construct */
6938         alloc_size += NETDEV_ALIGN - 1;
6939
6940         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6941         if (!p)
6942                 p = vzalloc(alloc_size);
6943         if (!p)
6944                 return NULL;
6945
6946         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6947         dev->padded = (char *)dev - (char *)p;
6948
6949         dev->pcpu_refcnt = alloc_percpu(int);
6950         if (!dev->pcpu_refcnt)
6951                 goto free_dev;
6952
6953         if (dev_addr_init(dev))
6954                 goto free_pcpu;
6955
6956         dev_mc_init(dev);
6957         dev_uc_init(dev);
6958
6959         dev_net_set(dev, &init_net);
6960
6961         dev->gso_max_size = GSO_MAX_SIZE;
6962         dev->gso_max_segs = GSO_MAX_SEGS;
6963         dev->gso_min_segs = 0;
6964
6965         INIT_LIST_HEAD(&dev->napi_list);
6966         INIT_LIST_HEAD(&dev->unreg_list);
6967         INIT_LIST_HEAD(&dev->close_list);
6968         INIT_LIST_HEAD(&dev->link_watch_list);
6969         INIT_LIST_HEAD(&dev->adj_list.upper);
6970         INIT_LIST_HEAD(&dev->adj_list.lower);
6971         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6972         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6973         INIT_LIST_HEAD(&dev->ptype_all);
6974         INIT_LIST_HEAD(&dev->ptype_specific);
6975         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6976         setup(dev);
6977
6978         dev->num_tx_queues = txqs;
6979         dev->real_num_tx_queues = txqs;
6980         if (netif_alloc_netdev_queues(dev))
6981                 goto free_all;
6982
6983 #ifdef CONFIG_SYSFS
6984         dev->num_rx_queues = rxqs;
6985         dev->real_num_rx_queues = rxqs;
6986         if (netif_alloc_rx_queues(dev))
6987                 goto free_all;
6988 #endif
6989
6990         strcpy(dev->name, name);
6991         dev->name_assign_type = name_assign_type;
6992         dev->group = INIT_NETDEV_GROUP;
6993         if (!dev->ethtool_ops)
6994                 dev->ethtool_ops = &default_ethtool_ops;
6995
6996         nf_hook_ingress_init(dev);
6997
6998         return dev;
6999
7000 free_all:
7001         free_netdev(dev);
7002         return NULL;
7003
7004 free_pcpu:
7005         free_percpu(dev->pcpu_refcnt);
7006 free_dev:
7007         netdev_freemem(dev);
7008         return NULL;
7009 }
7010 EXPORT_SYMBOL(alloc_netdev_mqs);
7011
7012 /**
7013  *      free_netdev - free network device
7014  *      @dev: device
7015  *
7016  *      This function does the last stage of destroying an allocated device
7017  *      interface. The reference to the device object is released.
7018  *      If this is the last reference then it will be freed.
7019  */
7020 void free_netdev(struct net_device *dev)
7021 {
7022         struct napi_struct *p, *n;
7023
7024         netif_free_tx_queues(dev);
7025 #ifdef CONFIG_SYSFS
7026         kvfree(dev->_rx);
7027 #endif
7028
7029         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7030
7031         /* Flush device addresses */
7032         dev_addr_flush(dev);
7033
7034         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7035                 netif_napi_del(p);
7036
7037         free_percpu(dev->pcpu_refcnt);
7038         dev->pcpu_refcnt = NULL;
7039
7040         /*  Compatibility with error handling in drivers */
7041         if (dev->reg_state == NETREG_UNINITIALIZED) {
7042                 netdev_freemem(dev);
7043                 return;
7044         }
7045
7046         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7047         dev->reg_state = NETREG_RELEASED;
7048
7049         /* will free via device release */
7050         put_device(&dev->dev);
7051 }
7052 EXPORT_SYMBOL(free_netdev);
7053
7054 /**
7055  *      synchronize_net -  Synchronize with packet receive processing
7056  *
7057  *      Wait for packets currently being received to be done.
7058  *      Does not block later packets from starting.
7059  */
7060 void synchronize_net(void)
7061 {
7062         might_sleep();
7063         if (rtnl_is_locked())
7064                 synchronize_rcu_expedited();
7065         else
7066                 synchronize_rcu();
7067 }
7068 EXPORT_SYMBOL(synchronize_net);
7069
7070 /**
7071  *      unregister_netdevice_queue - remove device from the kernel
7072  *      @dev: device
7073  *      @head: list
7074  *
7075  *      This function shuts down a device interface and removes it
7076  *      from the kernel tables.
7077  *      If head not NULL, device is queued to be unregistered later.
7078  *
7079  *      Callers must hold the rtnl semaphore.  You may want
7080  *      unregister_netdev() instead of this.
7081  */
7082
7083 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7084 {
7085         ASSERT_RTNL();
7086
7087         if (head) {
7088                 list_move_tail(&dev->unreg_list, head);
7089         } else {
7090                 rollback_registered(dev);
7091                 /* Finish processing unregister after unlock */
7092                 net_set_todo(dev);
7093         }
7094 }
7095 EXPORT_SYMBOL(unregister_netdevice_queue);
7096
7097 /**
7098  *      unregister_netdevice_many - unregister many devices
7099  *      @head: list of devices
7100  *
7101  *  Note: As most callers use a stack allocated list_head,
7102  *  we force a list_del() to make sure stack wont be corrupted later.
7103  */
7104 void unregister_netdevice_many(struct list_head *head)
7105 {
7106         struct net_device *dev;
7107
7108         if (!list_empty(head)) {
7109                 rollback_registered_many(head);
7110                 list_for_each_entry(dev, head, unreg_list)
7111                         net_set_todo(dev);
7112                 list_del(head);
7113         }
7114 }
7115 EXPORT_SYMBOL(unregister_netdevice_many);
7116
7117 /**
7118  *      unregister_netdev - remove device from the kernel
7119  *      @dev: device
7120  *
7121  *      This function shuts down a device interface and removes it
7122  *      from the kernel tables.
7123  *
7124  *      This is just a wrapper for unregister_netdevice that takes
7125  *      the rtnl semaphore.  In general you want to use this and not
7126  *      unregister_netdevice.
7127  */
7128 void unregister_netdev(struct net_device *dev)
7129 {
7130         rtnl_lock();
7131         unregister_netdevice(dev);
7132         rtnl_unlock();
7133 }
7134 EXPORT_SYMBOL(unregister_netdev);
7135
7136 /**
7137  *      dev_change_net_namespace - move device to different nethost namespace
7138  *      @dev: device
7139  *      @net: network namespace
7140  *      @pat: If not NULL name pattern to try if the current device name
7141  *            is already taken in the destination network namespace.
7142  *
7143  *      This function shuts down a device interface and moves it
7144  *      to a new network namespace. On success 0 is returned, on
7145  *      a failure a netagive errno code is returned.
7146  *
7147  *      Callers must hold the rtnl semaphore.
7148  */
7149
7150 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7151 {
7152         int err;
7153
7154         ASSERT_RTNL();
7155
7156         /* Don't allow namespace local devices to be moved. */
7157         err = -EINVAL;
7158         if (dev->features & NETIF_F_NETNS_LOCAL)
7159                 goto out;
7160
7161         /* Ensure the device has been registrered */
7162         if (dev->reg_state != NETREG_REGISTERED)
7163                 goto out;
7164
7165         /* Get out if there is nothing todo */
7166         err = 0;
7167         if (net_eq(dev_net(dev), net))
7168                 goto out;
7169
7170         /* Pick the destination device name, and ensure
7171          * we can use it in the destination network namespace.
7172          */
7173         err = -EEXIST;
7174         if (__dev_get_by_name(net, dev->name)) {
7175                 /* We get here if we can't use the current device name */
7176                 if (!pat)
7177                         goto out;
7178                 if (dev_get_valid_name(net, dev, pat) < 0)
7179                         goto out;
7180         }
7181
7182         /*
7183          * And now a mini version of register_netdevice unregister_netdevice.
7184          */
7185
7186         /* If device is running close it first. */
7187         dev_close(dev);
7188
7189         /* And unlink it from device chain */
7190         err = -ENODEV;
7191         unlist_netdevice(dev);
7192
7193         synchronize_net();
7194
7195         /* Shutdown queueing discipline. */
7196         dev_shutdown(dev);
7197
7198         /* Notify protocols, that we are about to destroy
7199            this device. They should clean all the things.
7200
7201            Note that dev->reg_state stays at NETREG_REGISTERED.
7202            This is wanted because this way 8021q and macvlan know
7203            the device is just moving and can keep their slaves up.
7204         */
7205         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7206         rcu_barrier();
7207         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7208         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7209
7210         /*
7211          *      Flush the unicast and multicast chains
7212          */
7213         dev_uc_flush(dev);
7214         dev_mc_flush(dev);
7215
7216         /* Send a netdev-removed uevent to the old namespace */
7217         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7218         netdev_adjacent_del_links(dev);
7219
7220         /* Actually switch the network namespace */
7221         dev_net_set(dev, net);
7222
7223         /* If there is an ifindex conflict assign a new one */
7224         if (__dev_get_by_index(net, dev->ifindex))
7225                 dev->ifindex = dev_new_index(net);
7226
7227         /* Send a netdev-add uevent to the new namespace */
7228         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7229         netdev_adjacent_add_links(dev);
7230
7231         /* Fixup kobjects */
7232         err = device_rename(&dev->dev, dev->name);
7233         WARN_ON(err);
7234
7235         /* Add the device back in the hashes */
7236         list_netdevice(dev);
7237
7238         /* Notify protocols, that a new device appeared. */
7239         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7240
7241         /*
7242          *      Prevent userspace races by waiting until the network
7243          *      device is fully setup before sending notifications.
7244          */
7245         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7246
7247         synchronize_net();
7248         err = 0;
7249 out:
7250         return err;
7251 }
7252 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7253
7254 static int dev_cpu_callback(struct notifier_block *nfb,
7255                             unsigned long action,
7256                             void *ocpu)
7257 {
7258         struct sk_buff **list_skb;
7259         struct sk_buff *skb;
7260         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7261         struct softnet_data *sd, *oldsd;
7262
7263         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7264                 return NOTIFY_OK;
7265
7266         local_irq_disable();
7267         cpu = smp_processor_id();
7268         sd = &per_cpu(softnet_data, cpu);
7269         oldsd = &per_cpu(softnet_data, oldcpu);
7270
7271         /* Find end of our completion_queue. */
7272         list_skb = &sd->completion_queue;
7273         while (*list_skb)
7274                 list_skb = &(*list_skb)->next;
7275         /* Append completion queue from offline CPU. */
7276         *list_skb = oldsd->completion_queue;
7277         oldsd->completion_queue = NULL;
7278
7279         /* Append output queue from offline CPU. */
7280         if (oldsd->output_queue) {
7281                 *sd->output_queue_tailp = oldsd->output_queue;
7282                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7283                 oldsd->output_queue = NULL;
7284                 oldsd->output_queue_tailp = &oldsd->output_queue;
7285         }
7286         /* Append NAPI poll list from offline CPU, with one exception :
7287          * process_backlog() must be called by cpu owning percpu backlog.
7288          * We properly handle process_queue & input_pkt_queue later.
7289          */
7290         while (!list_empty(&oldsd->poll_list)) {
7291                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7292                                                             struct napi_struct,
7293                                                             poll_list);
7294
7295                 list_del_init(&napi->poll_list);
7296                 if (napi->poll == process_backlog)
7297                         napi->state = 0;
7298                 else
7299                         ____napi_schedule(sd, napi);
7300         }
7301
7302         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7303         local_irq_enable();
7304
7305         /* Process offline CPU's input_pkt_queue */
7306         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7307                 netif_rx_ni(skb);
7308                 input_queue_head_incr(oldsd);
7309         }
7310         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7311                 netif_rx_ni(skb);
7312                 input_queue_head_incr(oldsd);
7313         }
7314
7315         return NOTIFY_OK;
7316 }
7317
7318
7319 /**
7320  *      netdev_increment_features - increment feature set by one
7321  *      @all: current feature set
7322  *      @one: new feature set
7323  *      @mask: mask feature set
7324  *
7325  *      Computes a new feature set after adding a device with feature set
7326  *      @one to the master device with current feature set @all.  Will not
7327  *      enable anything that is off in @mask. Returns the new feature set.
7328  */
7329 netdev_features_t netdev_increment_features(netdev_features_t all,
7330         netdev_features_t one, netdev_features_t mask)
7331 {
7332         if (mask & NETIF_F_GEN_CSUM)
7333                 mask |= NETIF_F_ALL_CSUM;
7334         mask |= NETIF_F_VLAN_CHALLENGED;
7335
7336         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7337         all &= one | ~NETIF_F_ALL_FOR_ALL;
7338
7339         /* If one device supports hw checksumming, set for all. */
7340         if (all & NETIF_F_GEN_CSUM)
7341                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7342
7343         return all;
7344 }
7345 EXPORT_SYMBOL(netdev_increment_features);
7346
7347 static struct hlist_head * __net_init netdev_create_hash(void)
7348 {
7349         int i;
7350         struct hlist_head *hash;
7351
7352         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7353         if (hash != NULL)
7354                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7355                         INIT_HLIST_HEAD(&hash[i]);
7356
7357         return hash;
7358 }
7359
7360 /* Initialize per network namespace state */
7361 static int __net_init netdev_init(struct net *net)
7362 {
7363         if (net != &init_net)
7364                 INIT_LIST_HEAD(&net->dev_base_head);
7365
7366         net->dev_name_head = netdev_create_hash();
7367         if (net->dev_name_head == NULL)
7368                 goto err_name;
7369
7370         net->dev_index_head = netdev_create_hash();
7371         if (net->dev_index_head == NULL)
7372                 goto err_idx;
7373
7374         return 0;
7375
7376 err_idx:
7377         kfree(net->dev_name_head);
7378 err_name:
7379         return -ENOMEM;
7380 }
7381
7382 /**
7383  *      netdev_drivername - network driver for the device
7384  *      @dev: network device
7385  *
7386  *      Determine network driver for device.
7387  */
7388 const char *netdev_drivername(const struct net_device *dev)
7389 {
7390         const struct device_driver *driver;
7391         const struct device *parent;
7392         const char *empty = "";
7393
7394         parent = dev->dev.parent;
7395         if (!parent)
7396                 return empty;
7397
7398         driver = parent->driver;
7399         if (driver && driver->name)
7400                 return driver->name;
7401         return empty;
7402 }
7403
7404 static void __netdev_printk(const char *level, const struct net_device *dev,
7405                             struct va_format *vaf)
7406 {
7407         if (dev && dev->dev.parent) {
7408                 dev_printk_emit(level[1] - '0',
7409                                 dev->dev.parent,
7410                                 "%s %s %s%s: %pV",
7411                                 dev_driver_string(dev->dev.parent),
7412                                 dev_name(dev->dev.parent),
7413                                 netdev_name(dev), netdev_reg_state(dev),
7414                                 vaf);
7415         } else if (dev) {
7416                 printk("%s%s%s: %pV",
7417                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7418         } else {
7419                 printk("%s(NULL net_device): %pV", level, vaf);
7420         }
7421 }
7422
7423 void netdev_printk(const char *level, const struct net_device *dev,
7424                    const char *format, ...)
7425 {
7426         struct va_format vaf;
7427         va_list args;
7428
7429         va_start(args, format);
7430
7431         vaf.fmt = format;
7432         vaf.va = &args;
7433
7434         __netdev_printk(level, dev, &vaf);
7435
7436         va_end(args);
7437 }
7438 EXPORT_SYMBOL(netdev_printk);
7439
7440 #define define_netdev_printk_level(func, level)                 \
7441 void func(const struct net_device *dev, const char *fmt, ...)   \
7442 {                                                               \
7443         struct va_format vaf;                                   \
7444         va_list args;                                           \
7445                                                                 \
7446         va_start(args, fmt);                                    \
7447                                                                 \
7448         vaf.fmt = fmt;                                          \
7449         vaf.va = &args;                                         \
7450                                                                 \
7451         __netdev_printk(level, dev, &vaf);                      \
7452                                                                 \
7453         va_end(args);                                           \
7454 }                                                               \
7455 EXPORT_SYMBOL(func);
7456
7457 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7458 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7459 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7460 define_netdev_printk_level(netdev_err, KERN_ERR);
7461 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7462 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7463 define_netdev_printk_level(netdev_info, KERN_INFO);
7464
7465 static void __net_exit netdev_exit(struct net *net)
7466 {
7467         kfree(net->dev_name_head);
7468         kfree(net->dev_index_head);
7469 }
7470
7471 static struct pernet_operations __net_initdata netdev_net_ops = {
7472         .init = netdev_init,
7473         .exit = netdev_exit,
7474 };
7475
7476 static void __net_exit default_device_exit(struct net *net)
7477 {
7478         struct net_device *dev, *aux;
7479         /*
7480          * Push all migratable network devices back to the
7481          * initial network namespace
7482          */
7483         rtnl_lock();
7484         for_each_netdev_safe(net, dev, aux) {
7485                 int err;
7486                 char fb_name[IFNAMSIZ];
7487
7488                 /* Ignore unmoveable devices (i.e. loopback) */
7489                 if (dev->features & NETIF_F_NETNS_LOCAL)
7490                         continue;
7491
7492                 /* Leave virtual devices for the generic cleanup */
7493                 if (dev->rtnl_link_ops)
7494                         continue;
7495
7496                 /* Push remaining network devices to init_net */
7497                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7498                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7499                 if (err) {
7500                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7501                                  __func__, dev->name, err);
7502                         BUG();
7503                 }
7504         }
7505         rtnl_unlock();
7506 }
7507
7508 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7509 {
7510         /* Return with the rtnl_lock held when there are no network
7511          * devices unregistering in any network namespace in net_list.
7512          */
7513         struct net *net;
7514         bool unregistering;
7515         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7516
7517         add_wait_queue(&netdev_unregistering_wq, &wait);
7518         for (;;) {
7519                 unregistering = false;
7520                 rtnl_lock();
7521                 list_for_each_entry(net, net_list, exit_list) {
7522                         if (net->dev_unreg_count > 0) {
7523                                 unregistering = true;
7524                                 break;
7525                         }
7526                 }
7527                 if (!unregistering)
7528                         break;
7529                 __rtnl_unlock();
7530
7531                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7532         }
7533         remove_wait_queue(&netdev_unregistering_wq, &wait);
7534 }
7535
7536 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7537 {
7538         /* At exit all network devices most be removed from a network
7539          * namespace.  Do this in the reverse order of registration.
7540          * Do this across as many network namespaces as possible to
7541          * improve batching efficiency.
7542          */
7543         struct net_device *dev;
7544         struct net *net;
7545         LIST_HEAD(dev_kill_list);
7546
7547         /* To prevent network device cleanup code from dereferencing
7548          * loopback devices or network devices that have been freed
7549          * wait here for all pending unregistrations to complete,
7550          * before unregistring the loopback device and allowing the
7551          * network namespace be freed.
7552          *
7553          * The netdev todo list containing all network devices
7554          * unregistrations that happen in default_device_exit_batch
7555          * will run in the rtnl_unlock() at the end of
7556          * default_device_exit_batch.
7557          */
7558         rtnl_lock_unregistering(net_list);
7559         list_for_each_entry(net, net_list, exit_list) {
7560                 for_each_netdev_reverse(net, dev) {
7561                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7562                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7563                         else
7564                                 unregister_netdevice_queue(dev, &dev_kill_list);
7565                 }
7566         }
7567         unregister_netdevice_many(&dev_kill_list);
7568         rtnl_unlock();
7569 }
7570
7571 static struct pernet_operations __net_initdata default_device_ops = {
7572         .exit = default_device_exit,
7573         .exit_batch = default_device_exit_batch,
7574 };
7575
7576 /*
7577  *      Initialize the DEV module. At boot time this walks the device list and
7578  *      unhooks any devices that fail to initialise (normally hardware not
7579  *      present) and leaves us with a valid list of present and active devices.
7580  *
7581  */
7582
7583 /*
7584  *       This is called single threaded during boot, so no need
7585  *       to take the rtnl semaphore.
7586  */
7587 static int __init net_dev_init(void)
7588 {
7589         int i, rc = -ENOMEM;
7590
7591         BUG_ON(!dev_boot_phase);
7592
7593         if (dev_proc_init())
7594                 goto out;
7595
7596         if (netdev_kobject_init())
7597                 goto out;
7598
7599         INIT_LIST_HEAD(&ptype_all);
7600         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7601                 INIT_LIST_HEAD(&ptype_base[i]);
7602
7603         INIT_LIST_HEAD(&offload_base);
7604
7605         if (register_pernet_subsys(&netdev_net_ops))
7606                 goto out;
7607
7608         /*
7609          *      Initialise the packet receive queues.
7610          */
7611
7612         for_each_possible_cpu(i) {
7613                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7614
7615                 skb_queue_head_init(&sd->input_pkt_queue);
7616                 skb_queue_head_init(&sd->process_queue);
7617                 INIT_LIST_HEAD(&sd->poll_list);
7618                 sd->output_queue_tailp = &sd->output_queue;
7619 #ifdef CONFIG_RPS
7620                 sd->csd.func = rps_trigger_softirq;
7621                 sd->csd.info = sd;
7622                 sd->cpu = i;
7623 #endif
7624
7625                 sd->backlog.poll = process_backlog;
7626                 sd->backlog.weight = weight_p;
7627         }
7628
7629         dev_boot_phase = 0;
7630
7631         /* The loopback device is special if any other network devices
7632          * is present in a network namespace the loopback device must
7633          * be present. Since we now dynamically allocate and free the
7634          * loopback device ensure this invariant is maintained by
7635          * keeping the loopback device as the first device on the
7636          * list of network devices.  Ensuring the loopback devices
7637          * is the first device that appears and the last network device
7638          * that disappears.
7639          */
7640         if (register_pernet_device(&loopback_net_ops))
7641                 goto out;
7642
7643         if (register_pernet_device(&default_device_ops))
7644                 goto out;
7645
7646         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7647         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7648
7649         hotcpu_notifier(dev_cpu_callback, 0);
7650         dst_init();
7651         rc = 0;
7652 out:
7653         return rc;
7654 }
7655
7656 subsys_initcall(net_dev_init);