net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps */
 148 static struct list_head offload_base __read_mostly;
 149
 150 static int netif_rx_internal(struct sk_buff *skb);
 151
 152 /*
 153  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 154  * semaphore.
 155  *
 156  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 157  *
 158  * Writers must hold the rtnl semaphore while they loop through the
 159  * dev_base_head list, and hold dev_base_lock for writing when they do the
 160  * actual updates.  This allows pure readers to access the list even
 161  * while a writer is preparing to update it.
 162  *
 163  * To put it another way, dev_base_lock is held for writing only to
 164  * protect against pure readers; the rtnl semaphore provides the
 165  * protection against other writers.
 166  *
 167  * See, for example usages, register_netdevice() and
 168  * unregister_netdevice(), which must be called with the rtnl
 169  * semaphore held.
 170  */
 171 DEFINE_RWLOCK(dev_base_lock);
 172 EXPORT_SYMBOL(dev_base_lock);
 173
 174 /* protects napi_hash addition/deletion and napi_gen_id */
 175 static DEFINE_SPINLOCK(napi_hash_lock);
 176
 177 static unsigned int napi_gen_id;
 178 static DEFINE_HASHTABLE(napi_hash, 8);
 179
 180 static seqcount_t devnet_rename_seq;
 181
 182 static inline void dev_base_seq_inc(struct net *net)
 183 {
 184         while (++net->dev_base_seq == 0);
 185 }
 186
 187 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 188 {
 189         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 190
 191         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 192 }
 193
 194 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 195 {
 196         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 197 }
 198
 199 static inline void rps_lock(struct softnet_data *sd)
 200 {
 201 #ifdef CONFIG_RPS
 202         spin_lock(&sd->input_pkt_queue.lock);
 203 #endif
 204 }
 205
 206 static inline void rps_unlock(struct softnet_data *sd)
 207 {
 208 #ifdef CONFIG_RPS
 209         spin_unlock(&sd->input_pkt_queue.lock);
 210 #endif
 211 }
 212
 213 /* Device list insertion */
 214 static void list_netdevice(struct net_device *dev)
 215 {
 216         struct net *net = dev_net(dev);
 217
 218         ASSERT_RTNL();
 219
 220         write_lock_bh(&dev_base_lock);
 221         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 222         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 223         hlist_add_head_rcu(&dev->index_hlist,
 224                            dev_index_hash(net, dev->ifindex));
 225         write_unlock_bh(&dev_base_lock);
 226
 227         dev_base_seq_inc(net);
 228 }
 229
 230 /* Device list removal
 231  * caller must respect a RCU grace period before freeing/reusing dev
 232  */
 233 static void unlist_netdevice(struct net_device *dev)
 234 {
 235         ASSERT_RTNL();
 236
 237         /* Unlink dev from the device chain */
 238         write_lock_bh(&dev_base_lock);
 239         list_del_rcu(&dev->dev_list);
 240         hlist_del_rcu(&dev->name_hlist);
 241         hlist_del_rcu(&dev->index_hlist);
 242         write_unlock_bh(&dev_base_lock);
 243
 244         dev_base_seq_inc(dev_net(dev));
 245 }
 246
 247 /*
 248  *      Our notifier list
 249  */
 250
 251 static RAW_NOTIFIER_HEAD(netdev_chain);
 252
 253 /*
 254  *      Device drivers call our routines to queue packets here. We empty the
 255  *      queue in the local softnet handler.
 256  */
 257
 258 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 259 EXPORT_PER_CPU_SYMBOL(softnet_data);
 260
 261 #ifdef CONFIG_LOCKDEP
 262 /*
 263  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 264  * according to dev->type
 265  */
 266 static const unsigned short netdev_lock_type[] =
 267         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 268          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 269          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 270          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 271          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 272          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 273          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 274          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 275          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 276          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 277          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 278          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 279          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 280          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 281          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 282
 283 static const char *const netdev_lock_name[] =
 284         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 285          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 286          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 287          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 288          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 289          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 290          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 291          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 292          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 293          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 294          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 295          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 296          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 297          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 298          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 299
 300 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 301 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 302
 303 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 304 {
 305         int i;
 306
 307         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 308                 if (netdev_lock_type[i] == dev_type)
 309                         return i;
 310         /* the last key is used by default */
 311         return ARRAY_SIZE(netdev_lock_type) - 1;
 312 }
 313
 314 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 315                                                  unsigned short dev_type)
 316 {
 317         int i;
 318
 319         i = netdev_lock_pos(dev_type);
 320         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 321                                    netdev_lock_name[i]);
 322 }
 323
 324 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 325 {
 326         int i;
 327
 328         i = netdev_lock_pos(dev->type);
 329         lockdep_set_class_and_name(&dev->addr_list_lock,
 330                                    &netdev_addr_lock_key[i],
 331                                    netdev_lock_name[i]);
 332 }
 333 #else
 334 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 335                                                  unsigned short dev_type)
 336 {
 337 }
 338 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 339 {
 340 }
 341 #endif
 342
 343 /*******************************************************************************
 344
 345                 Protocol management and registration routines
 346
 347 *******************************************************************************/
 348
 349 /*
 350  *      Add a protocol ID to the list. Now that the input handler is
 351  *      smarter we can dispense with all the messy stuff that used to be
 352  *      here.
 353  *
 354  *      BEWARE!!! Protocol handlers, mangling input packets,
 355  *      MUST BE last in hash buckets and checking protocol handlers
 356  *      MUST start from promiscuous ptype_all chain in net_bh.
 357  *      It is true now, do not change it.
 358  *      Explanation follows: if protocol handler, mangling packet, will
 359  *      be the first on list, it is not able to sense, that packet
 360  *      is cloned and should be copied-on-write, so that it will
 361  *      change it and subsequent readers will get broken packet.
 362  *                                                      --ANK (980803)
 363  */
 364
 365 static inline struct list_head *ptype_head(const struct packet_type *pt)
 366 {
 367         if (pt->type == htons(ETH_P_ALL))
 368                 return &ptype_all;
 369         else
 370                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 371 }
 372
 373 /**
 374  *      dev_add_pack - add packet handler
 375  *      @pt: packet type declaration
 376  *
 377  *      Add a protocol handler to the networking stack. The passed &packet_type
 378  *      is linked into kernel lists and may not be freed until it has been
 379  *      removed from the kernel lists.
 380  *
 381  *      This call does not sleep therefore it can not
 382  *      guarantee all CPU's that are in middle of receiving packets
 383  *      will see the new packet type (until the next received packet).
 384  */
 385
 386 void dev_add_pack(struct packet_type *pt)
 387 {
 388         struct list_head *head = ptype_head(pt);
 389
 390         spin_lock(&ptype_lock);
 391         list_add_rcu(&pt->list, head);
 392         spin_unlock(&ptype_lock);
 393 }
 394 EXPORT_SYMBOL(dev_add_pack);
 395
 396 /**
 397  *      __dev_remove_pack        - remove packet handler
 398  *      @pt: packet type declaration
 399  *
 400  *      Remove a protocol handler that was previously added to the kernel
 401  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 402  *      from the kernel lists and can be freed or reused once this function
 403  *      returns.
 404  *
 405  *      The packet type might still be in use by receivers
 406  *      and must not be freed until after all the CPU's have gone
 407  *      through a quiescent state.
 408  */
 409 void __dev_remove_pack(struct packet_type *pt)
 410 {
 411         struct list_head *head = ptype_head(pt);
 412         struct packet_type *pt1;
 413
 414         spin_lock(&ptype_lock);
 415
 416         list_for_each_entry(pt1, head, list) {
 417                 if (pt == pt1) {
 418                         list_del_rcu(&pt->list);
 419                         goto out;
 420                 }
 421         }
 422
 423         pr_warn("dev_remove_pack: %p not found\n", pt);
 424 out:
 425         spin_unlock(&ptype_lock);
 426 }
 427 EXPORT_SYMBOL(__dev_remove_pack);
 428
 429 /**
 430  *      dev_remove_pack  - remove packet handler
 431  *      @pt: packet type declaration
 432  *
 433  *      Remove a protocol handler that was previously added to the kernel
 434  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 435  *      from the kernel lists and can be freed or reused once this function
 436  *      returns.
 437  *
 438  *      This call sleeps to guarantee that no CPU is looking at the packet
 439  *      type after return.
 440  */
 441 void dev_remove_pack(struct packet_type *pt)
 442 {
 443         __dev_remove_pack(pt);
 444
 445         synchronize_net();
 446 }
 447 EXPORT_SYMBOL(dev_remove_pack);
 448
 449
 450 /**
 451  *      dev_add_offload - register offload handlers
 452  *      @po: protocol offload declaration
 453  *
 454  *      Add protocol offload handlers to the networking stack. The passed
 455  *      &proto_offload is linked into kernel lists and may not be freed until
 456  *      it has been removed from the kernel lists.
 457  *
 458  *      This call does not sleep therefore it can not
 459  *      guarantee all CPU's that are in middle of receiving packets
 460  *      will see the new offload handlers (until the next received packet).
 461  */
 462 void dev_add_offload(struct packet_offload *po)
 463 {
 464         struct list_head *head = &offload_base;
 465
 466         spin_lock(&offload_lock);
 467         list_add_rcu(&po->list, head);
 468         spin_unlock(&offload_lock);
 469 }
 470 EXPORT_SYMBOL(dev_add_offload);
 471
 472 /**
 473  *      __dev_remove_offload     - remove offload handler
 474  *      @po: packet offload declaration
 475  *
 476  *      Remove a protocol offload handler that was previously added to the
 477  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 478  *      is removed from the kernel lists and can be freed or reused once this
 479  *      function returns.
 480  *
 481  *      The packet type might still be in use by receivers
 482  *      and must not be freed until after all the CPU's have gone
 483  *      through a quiescent state.
 484  */
 485 static void __dev_remove_offload(struct packet_offload *po)
 486 {
 487         struct list_head *head = &offload_base;
 488         struct packet_offload *po1;
 489
 490         spin_lock(&offload_lock);
 491
 492         list_for_each_entry(po1, head, list) {
 493                 if (po == po1) {
 494                         list_del_rcu(&po->list);
 495                         goto out;
 496                 }
 497         }
 498
 499         pr_warn("dev_remove_offload: %p not found\n", po);
 500 out:
 501         spin_unlock(&offload_lock);
 502 }
 503
 504 /**
 505  *      dev_remove_offload       - remove packet offload handler
 506  *      @po: packet offload declaration
 507  *
 508  *      Remove a packet offload handler that was previously added to the kernel
 509  *      offload handlers by dev_add_offload(). The passed &offload_type is
 510  *      removed from the kernel lists and can be freed or reused once this
 511  *      function returns.
 512  *
 513  *      This call sleeps to guarantee that no CPU is looking at the packet
 514  *      type after return.
 515  */
 516 void dev_remove_offload(struct packet_offload *po)
 517 {
 518         __dev_remove_offload(po);
 519
 520         synchronize_net();
 521 }
 522 EXPORT_SYMBOL(dev_remove_offload);
 523
 524 /******************************************************************************
 525
 526                       Device Boot-time Settings Routines
 527
 528 *******************************************************************************/
 529
 530 /* Boot time configuration table */
 531 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 532
 533 /**
 534  *      netdev_boot_setup_add   - add new setup entry
 535  *      @name: name of the device
 536  *      @map: configured settings for the device
 537  *
 538  *      Adds new setup entry to the dev_boot_setup list.  The function
 539  *      returns 0 on error and 1 on success.  This is a generic routine to
 540  *      all netdevices.
 541  */
 542 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 543 {
 544         struct netdev_boot_setup *s;
 545         int i;
 546
 547         s = dev_boot_setup;
 548         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 549                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 550                         memset(s[i].name, 0, sizeof(s[i].name));
 551                         strlcpy(s[i].name, name, IFNAMSIZ);
 552                         memcpy(&s[i].map, map, sizeof(s[i].map));
 553                         break;
 554                 }
 555         }
 556
 557         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 558 }
 559
 560 /**
 561  *      netdev_boot_setup_check - check boot time settings
 562  *      @dev: the netdevice
 563  *
 564  *      Check boot time settings for the device.
 565  *      The found settings are set for the device to be used
 566  *      later in the device probing.
 567  *      Returns 0 if no settings found, 1 if they are.
 568  */
 569 int netdev_boot_setup_check(struct net_device *dev)
 570 {
 571         struct netdev_boot_setup *s = dev_boot_setup;
 572         int i;
 573
 574         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 575                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 576                     !strcmp(dev->name, s[i].name)) {
 577                         dev->irq        = s[i].map.irq;
 578                         dev->base_addr  = s[i].map.base_addr;
 579                         dev->mem_start  = s[i].map.mem_start;
 580                         dev->mem_end    = s[i].map.mem_end;
 581                         return 1;
 582                 }
 583         }
 584         return 0;
 585 }
 586 EXPORT_SYMBOL(netdev_boot_setup_check);
 587
 588
 589 /**
 590  *      netdev_boot_base        - get address from boot time settings
 591  *      @prefix: prefix for network device
 592  *      @unit: id for network device
 593  *
 594  *      Check boot time settings for the base address of device.
 595  *      The found settings are set for the device to be used
 596  *      later in the device probing.
 597  *      Returns 0 if no settings found.
 598  */
 599 unsigned long netdev_boot_base(const char *prefix, int unit)
 600 {
 601         const struct netdev_boot_setup *s = dev_boot_setup;
 602         char name[IFNAMSIZ];
 603         int i;
 604
 605         sprintf(name, "%s%d", prefix, unit);
 606
 607         /*
 608          * If device already registered then return base of 1
 609          * to indicate not to probe for this interface
 610          */
 611         if (__dev_get_by_name(&init_net, name))
 612                 return 1;
 613
 614         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 615                 if (!strcmp(name, s[i].name))
 616                         return s[i].map.base_addr;
 617         return 0;
 618 }
 619
 620 /*
 621  * Saves at boot time configured settings for any netdevice.
 622  */
 623 int __init netdev_boot_setup(char *str)
 624 {
 625         int ints[5];
 626         struct ifmap map;
 627
 628         str = get_options(str, ARRAY_SIZE(ints), ints);
 629         if (!str || !*str)
 630                 return 0;
 631
 632         /* Save settings */
 633         memset(&map, 0, sizeof(map));
 634         if (ints[0] > 0)
 635                 map.irq = ints[1];
 636         if (ints[0] > 1)
 637                 map.base_addr = ints[2];
 638         if (ints[0] > 2)
 639                 map.mem_start = ints[3];
 640         if (ints[0] > 3)
 641                 map.mem_end = ints[4];
 642
 643         /* Add new entry to the list */
 644         return netdev_boot_setup_add(str, &map);
 645 }
 646
 647 __setup("netdev=", netdev_boot_setup);
 648
 649 /*******************************************************************************
 650
 651                             Device Interface Subroutines
 652
 653 *******************************************************************************/
 654
 655 /**
 656  *      __dev_get_by_name       - find a device by its name
 657  *      @net: the applicable net namespace
 658  *      @name: name to find
 659  *
 660  *      Find an interface by name. Must be called under RTNL semaphore
 661  *      or @dev_base_lock. If the name is found a pointer to the device
 662  *      is returned. If the name is not found then %NULL is returned. The
 663  *      reference counters are not incremented so the caller must be
 664  *      careful with locks.
 665  */
 666
 667 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 668 {
 669         struct net_device *dev;
 670         struct hlist_head *head = dev_name_hash(net, name);
 671
 672         hlist_for_each_entry(dev, head, name_hlist)
 673                 if (!strncmp(dev->name, name, IFNAMSIZ))
 674                         return dev;
 675
 676         return NULL;
 677 }
 678 EXPORT_SYMBOL(__dev_get_by_name);
 679
 680 /**
 681  *      dev_get_by_name_rcu     - find a device by its name
 682  *      @net: the applicable net namespace
 683  *      @name: name to find
 684  *
 685  *      Find an interface by name.
 686  *      If the name is found a pointer to the device is returned.
 687  *      If the name is not found then %NULL is returned.
 688  *      The reference counters are not incremented so the caller must be
 689  *      careful with locks. The caller must hold RCU lock.
 690  */
 691
 692 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 693 {
 694         struct net_device *dev;
 695         struct hlist_head *head = dev_name_hash(net, name);
 696
 697         hlist_for_each_entry_rcu(dev, head, name_hlist)
 698                 if (!strncmp(dev->name, name, IFNAMSIZ))
 699                         return dev;
 700
 701         return NULL;
 702 }
 703 EXPORT_SYMBOL(dev_get_by_name_rcu);
 704
 705 /**
 706  *      dev_get_by_name         - find a device by its name
 707  *      @net: the applicable net namespace
 708  *      @name: name to find
 709  *
 710  *      Find an interface by name. This can be called from any
 711  *      context and does its own locking. The returned handle has
 712  *      the usage count incremented and the caller must use dev_put() to
 713  *      release it when it is no longer needed. %NULL is returned if no
 714  *      matching device is found.
 715  */
 716
 717 struct net_device *dev_get_by_name(struct net *net, const char *name)
 718 {
 719         struct net_device *dev;
 720
 721         rcu_read_lock();
 722         dev = dev_get_by_name_rcu(net, name);
 723         if (dev)
 724                 dev_hold(dev);
 725         rcu_read_unlock();
 726         return dev;
 727 }
 728 EXPORT_SYMBOL(dev_get_by_name);
 729
 730 /**
 731  *      __dev_get_by_index - find a device by its ifindex
 732  *      @net: the applicable net namespace
 733  *      @ifindex: index of device
 734  *
 735  *      Search for an interface by index. Returns %NULL if the device
 736  *      is not found or a pointer to the device. The device has not
 737  *      had its reference counter increased so the caller must be careful
 738  *      about locking. The caller must hold either the RTNL semaphore
 739  *      or @dev_base_lock.
 740  */
 741
 742 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 743 {
 744         struct net_device *dev;
 745         struct hlist_head *head = dev_index_hash(net, ifindex);
 746
 747         hlist_for_each_entry(dev, head, index_hlist)
 748                 if (dev->ifindex == ifindex)
 749                         return dev;
 750
 751         return NULL;
 752 }
 753 EXPORT_SYMBOL(__dev_get_by_index);
 754
 755 /**
 756  *      dev_get_by_index_rcu - find a device by its ifindex
 757  *      @net: the applicable net namespace
 758  *      @ifindex: index of device
 759  *
 760  *      Search for an interface by index. Returns %NULL if the device
 761  *      is not found or a pointer to the device. The device has not
 762  *      had its reference counter increased so the caller must be careful
 763  *      about locking. The caller must hold RCU lock.
 764  */
 765
 766 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 767 {
 768         struct net_device *dev;
 769         struct hlist_head *head = dev_index_hash(net, ifindex);
 770
 771         hlist_for_each_entry_rcu(dev, head, index_hlist)
 772                 if (dev->ifindex == ifindex)
 773                         return dev;
 774
 775         return NULL;
 776 }
 777 EXPORT_SYMBOL(dev_get_by_index_rcu);
 778
 779
 780 /**
 781  *      dev_get_by_index - find a device by its ifindex
 782  *      @net: the applicable net namespace
 783  *      @ifindex: index of device
 784  *
 785  *      Search for an interface by index. Returns NULL if the device
 786  *      is not found or a pointer to the device. The device returned has
 787  *      had a reference added and the pointer is safe until the user calls
 788  *      dev_put to indicate they have finished with it.
 789  */
 790
 791 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 792 {
 793         struct net_device *dev;
 794
 795         rcu_read_lock();
 796         dev = dev_get_by_index_rcu(net, ifindex);
 797         if (dev)
 798                 dev_hold(dev);
 799         rcu_read_unlock();
 800         return dev;
 801 }
 802 EXPORT_SYMBOL(dev_get_by_index);
 803
 804 /**
 805  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 806  *      @net: network namespace
 807  *      @name: a pointer to the buffer where the name will be stored.
 808  *      @ifindex: the ifindex of the interface to get the name from.
 809  *
 810  *      The use of raw_seqcount_begin() and cond_resched() before
 811  *      retrying is required as we want to give the writers a chance
 812  *      to complete when CONFIG_PREEMPT is not set.
 813  */
 814 int netdev_get_name(struct net *net, char *name, int ifindex)
 815 {
 816         struct net_device *dev;
 817         unsigned int seq;
 818
 819 retry:
 820         seq = raw_seqcount_begin(&devnet_rename_seq);
 821         rcu_read_lock();
 822         dev = dev_get_by_index_rcu(net, ifindex);
 823         if (!dev) {
 824                 rcu_read_unlock();
 825                 return -ENODEV;
 826         }
 827
 828         strcpy(name, dev->name);
 829         rcu_read_unlock();
 830         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 831                 cond_resched();
 832                 goto retry;
 833         }
 834
 835         return 0;
 836 }
 837
 838 /**
 839  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 840  *      @net: the applicable net namespace
 841  *      @type: media type of device
 842  *      @ha: hardware address
 843  *
 844  *      Search for an interface by MAC address. Returns NULL if the device
 845  *      is not found or a pointer to the device.
 846  *      The caller must hold RCU or RTNL.
 847  *      The returned device has not had its ref count increased
 848  *      and the caller must therefore be careful about locking
 849  *
 850  */
 851
 852 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 853                                        const char *ha)
 854 {
 855         struct net_device *dev;
 856
 857         for_each_netdev_rcu(net, dev)
 858                 if (dev->type == type &&
 859                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 860                         return dev;
 861
 862         return NULL;
 863 }
 864 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 865
 866 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 867 {
 868         struct net_device *dev;
 869
 870         ASSERT_RTNL();
 871         for_each_netdev(net, dev)
 872                 if (dev->type == type)
 873                         return dev;
 874
 875         return NULL;
 876 }
 877 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 878
 879 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 880 {
 881         struct net_device *dev, *ret = NULL;
 882
 883         rcu_read_lock();
 884         for_each_netdev_rcu(net, dev)
 885                 if (dev->type == type) {
 886                         dev_hold(dev);
 887                         ret = dev;
 888                         break;
 889                 }
 890         rcu_read_unlock();
 891         return ret;
 892 }
 893 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 894
 895 /**
 896  *      dev_get_by_flags_rcu - find any device with given flags
 897  *      @net: the applicable net namespace
 898  *      @if_flags: IFF_* values
 899  *      @mask: bitmask of bits in if_flags to check
 900  *
 901  *      Search for any interface with the given flags. Returns NULL if a device
 902  *      is not found or a pointer to the device. Must be called inside
 903  *      rcu_read_lock(), and result refcount is unchanged.
 904  */
 905
 906 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 907                                     unsigned short mask)
 908 {
 909         struct net_device *dev, *ret;
 910
 911         ret = NULL;
 912         for_each_netdev_rcu(net, dev) {
 913                 if (((dev->flags ^ if_flags) & mask) == 0) {
 914                         ret = dev;
 915                         break;
 916                 }
 917         }
 918         return ret;
 919 }
 920 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 921
 922 /**
 923  *      dev_valid_name - check if name is okay for network device
 924  *      @name: name string
 925  *
 926  *      Network device names need to be valid file names to
 927  *      to allow sysfs to work.  We also disallow any kind of
 928  *      whitespace.
 929  */
 930 bool dev_valid_name(const char *name)
 931 {
 932         if (*name == '\0')
 933                 return false;
 934         if (strlen(name) >= IFNAMSIZ)
 935                 return false;
 936         if (!strcmp(name, ".") || !strcmp(name, ".."))
 937                 return false;
 938
 939         while (*name) {
 940                 if (*name == '/' || isspace(*name))
 941                         return false;
 942                 name++;
 943         }
 944         return true;
 945 }
 946 EXPORT_SYMBOL(dev_valid_name);
 947
 948 /**
 949  *      __dev_alloc_name - allocate a name for a device
 950  *      @net: network namespace to allocate the device name in
 951  *      @name: name format string
 952  *      @buf:  scratch buffer and result name string
 953  *
 954  *      Passed a format string - eg "lt%d" it will try and find a suitable
 955  *      id. It scans list of devices to build up a free map, then chooses
 956  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 957  *      while allocating the name and adding the device in order to avoid
 958  *      duplicates.
 959  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 960  *      Returns the number of the unit assigned or a negative errno code.
 961  */
 962
 963 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 964 {
 965         int i = 0;
 966         const char *p;
 967         const int max_netdevices = 8*PAGE_SIZE;
 968         unsigned long *inuse;
 969         struct net_device *d;
 970
 971         p = strnchr(name, IFNAMSIZ-1, '%');
 972         if (p) {
 973                 /*
 974                  * Verify the string as this thing may have come from
 975                  * the user.  There must be either one "%d" and no other "%"
 976                  * characters.
 977                  */
 978                 if (p[1] != 'd' || strchr(p + 2, '%'))
 979                         return -EINVAL;
 980
 981                 /* Use one page as a bit array of possible slots */
 982                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 983                 if (!inuse)
 984                         return -ENOMEM;
 985
 986                 for_each_netdev(net, d) {
 987                         if (!sscanf(d->name, name, &i))
 988                                 continue;
 989                         if (i < 0 || i >= max_netdevices)
 990                                 continue;
 991
 992                         /*  avoid cases where sscanf is not exact inverse of printf */
 993                         snprintf(buf, IFNAMSIZ, name, i);
 994                         if (!strncmp(buf, d->name, IFNAMSIZ))
 995                                 set_bit(i, inuse);
 996                 }
 997
 998                 i = find_first_zero_bit(inuse, max_netdevices);
 999                 free_page((unsigned long) inuse);
1000         }
1001
1002         if (buf != name)
1003                 snprintf(buf, IFNAMSIZ, name, i);
1004         if (!__dev_get_by_name(net, buf))
1005                 return i;
1006
1007         /* It is possible to run out of possible slots
1008          * when the name is long and there isn't enough space left
1009          * for the digits, or if all bits are used.
1010          */
1011         return -ENFILE;
1012 }
1013
1014 /**
1015  *      dev_alloc_name - allocate a name for a device
1016  *      @dev: device
1017  *      @name: name format string
1018  *
1019  *      Passed a format string - eg "lt%d" it will try and find a suitable
1020  *      id. It scans list of devices to build up a free map, then chooses
1021  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1022  *      while allocating the name and adding the device in order to avoid
1023  *      duplicates.
1024  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1025  *      Returns the number of the unit assigned or a negative errno code.
1026  */
1027
1028 int dev_alloc_name(struct net_device *dev, const char *name)
1029 {
1030         char buf[IFNAMSIZ];
1031         struct net *net;
1032         int ret;
1033
1034         BUG_ON(!dev_net(dev));
1035         net = dev_net(dev);
1036         ret = __dev_alloc_name(net, name, buf);
1037         if (ret >= 0)
1038                 strlcpy(dev->name, buf, IFNAMSIZ);
1039         return ret;
1040 }
1041 EXPORT_SYMBOL(dev_alloc_name);
1042
1043 static int dev_alloc_name_ns(struct net *net,
1044                              struct net_device *dev,
1045                              const char *name)
1046 {
1047         char buf[IFNAMSIZ];
1048         int ret;
1049
1050         ret = __dev_alloc_name(net, name, buf);
1051         if (ret >= 0)
1052                 strlcpy(dev->name, buf, IFNAMSIZ);
1053         return ret;
1054 }
1055
1056 static int dev_get_valid_name(struct net *net,
1057                               struct net_device *dev,
1058                               const char *name)
1059 {
1060         BUG_ON(!net);
1061
1062         if (!dev_valid_name(name))
1063                 return -EINVAL;
1064
1065         if (strchr(name, '%'))
1066                 return dev_alloc_name_ns(net, dev, name);
1067         else if (__dev_get_by_name(net, name))
1068                 return -EEXIST;
1069         else if (dev->name != name)
1070                 strlcpy(dev->name, name, IFNAMSIZ);
1071
1072         return 0;
1073 }
1074
1075 /**
1076  *      dev_change_name - change name of a device
1077  *      @dev: device
1078  *      @newname: name (or format string) must be at least IFNAMSIZ
1079  *
1080  *      Change name of a device, can pass format strings "eth%d".
1081  *      for wildcarding.
1082  */
1083 int dev_change_name(struct net_device *dev, const char *newname)
1084 {
1085         char oldname[IFNAMSIZ];
1086         int err = 0;
1087         int ret;
1088         struct net *net;
1089
1090         ASSERT_RTNL();
1091         BUG_ON(!dev_net(dev));
1092
1093         net = dev_net(dev);
1094         if (dev->flags & IFF_UP)
1095                 return -EBUSY;
1096
1097         write_seqcount_begin(&devnet_rename_seq);
1098
1099         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1100                 write_seqcount_end(&devnet_rename_seq);
1101                 return 0;
1102         }
1103
1104         memcpy(oldname, dev->name, IFNAMSIZ);
1105
1106         err = dev_get_valid_name(net, dev, newname);
1107         if (err < 0) {
1108                 write_seqcount_end(&devnet_rename_seq);
1109                 return err;
1110         }
1111
1112 rollback:
1113         ret = device_rename(&dev->dev, dev->name);
1114         if (ret) {
1115                 memcpy(dev->name, oldname, IFNAMSIZ);
1116                 write_seqcount_end(&devnet_rename_seq);
1117                 return ret;
1118         }
1119
1120         write_seqcount_end(&devnet_rename_seq);
1121
1122         netdev_adjacent_rename_links(dev, oldname);
1123
1124         write_lock_bh(&dev_base_lock);
1125         hlist_del_rcu(&dev->name_hlist);
1126         write_unlock_bh(&dev_base_lock);
1127
1128         synchronize_rcu();
1129
1130         write_lock_bh(&dev_base_lock);
1131         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1132         write_unlock_bh(&dev_base_lock);
1133
1134         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1135         ret = notifier_to_errno(ret);
1136
1137         if (ret) {
1138                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1139                 if (err >= 0) {
1140                         err = ret;
1141                         write_seqcount_begin(&devnet_rename_seq);
1142                         memcpy(dev->name, oldname, IFNAMSIZ);
1143                         memcpy(oldname, newname, IFNAMSIZ);
1144                         goto rollback;
1145                 } else {
1146                         pr_err("%s: name change rollback failed: %d\n",
1147                                dev->name, ret);
1148                 }
1149         }
1150
1151         return err;
1152 }
1153
1154 /**
1155  *      dev_set_alias - change ifalias of a device
1156  *      @dev: device
1157  *      @alias: name up to IFALIASZ
1158  *      @len: limit of bytes to copy from info
1159  *
1160  *      Set ifalias for a device,
1161  */
1162 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1163 {
1164         char *new_ifalias;
1165
1166         ASSERT_RTNL();
1167
1168         if (len >= IFALIASZ)
1169                 return -EINVAL;
1170
1171         if (!len) {
1172                 kfree(dev->ifalias);
1173                 dev->ifalias = NULL;
1174                 return 0;
1175         }
1176
1177         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1178         if (!new_ifalias)
1179                 return -ENOMEM;
1180         dev->ifalias = new_ifalias;
1181
1182         strlcpy(dev->ifalias, alias, len+1);
1183         return len;
1184 }
1185
1186
1187 /**
1188  *      netdev_features_change - device changes features
1189  *      @dev: device to cause notification
1190  *
1191  *      Called to indicate a device has changed features.
1192  */
1193 void netdev_features_change(struct net_device *dev)
1194 {
1195         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1196 }
1197 EXPORT_SYMBOL(netdev_features_change);
1198
1199 /**
1200  *      netdev_state_change - device changes state
1201  *      @dev: device to cause notification
1202  *
1203  *      Called to indicate a device has changed state. This function calls
1204  *      the notifier chains for netdev_chain and sends a NEWLINK message
1205  *      to the routing socket.
1206  */
1207 void netdev_state_change(struct net_device *dev)
1208 {
1209         if (dev->flags & IFF_UP) {
1210                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1211                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1212         }
1213 }
1214 EXPORT_SYMBOL(netdev_state_change);
1215
1216 /**
1217  *      netdev_notify_peers - notify network peers about existence of @dev
1218  *      @dev: network device
1219  *
1220  * Generate traffic such that interested network peers are aware of
1221  * @dev, such as by generating a gratuitous ARP. This may be used when
1222  * a device wants to inform the rest of the network about some sort of
1223  * reconfiguration such as a failover event or virtual machine
1224  * migration.
1225  */
1226 void netdev_notify_peers(struct net_device *dev)
1227 {
1228         rtnl_lock();
1229         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1230         rtnl_unlock();
1231 }
1232 EXPORT_SYMBOL(netdev_notify_peers);
1233
1234 static int __dev_open(struct net_device *dev)
1235 {
1236         const struct net_device_ops *ops = dev->netdev_ops;
1237         int ret;
1238
1239         ASSERT_RTNL();
1240
1241         if (!netif_device_present(dev))
1242                 return -ENODEV;
1243
1244         /* Block netpoll from trying to do any rx path servicing.
1245          * If we don't do this there is a chance ndo_poll_controller
1246          * or ndo_poll may be running while we open the device
1247          */
1248         netpoll_poll_disable(dev);
1249
1250         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1251         ret = notifier_to_errno(ret);
1252         if (ret)
1253                 return ret;
1254
1255         set_bit(__LINK_STATE_START, &dev->state);
1256
1257         if (ops->ndo_validate_addr)
1258                 ret = ops->ndo_validate_addr(dev);
1259
1260         if (!ret && ops->ndo_open)
1261                 ret = ops->ndo_open(dev);
1262
1263         netpoll_poll_enable(dev);
1264
1265         if (ret)
1266                 clear_bit(__LINK_STATE_START, &dev->state);
1267         else {
1268                 dev->flags |= IFF_UP;
1269                 net_dmaengine_get();
1270                 dev_set_rx_mode(dev);
1271                 dev_activate(dev);
1272                 add_device_randomness(dev->dev_addr, dev->addr_len);
1273         }
1274
1275         return ret;
1276 }
1277
1278 /**
1279  *      dev_open        - prepare an interface for use.
1280  *      @dev:   device to open
1281  *
1282  *      Takes a device from down to up state. The device's private open
1283  *      function is invoked and then the multicast lists are loaded. Finally
1284  *      the device is moved into the up state and a %NETDEV_UP message is
1285  *      sent to the netdev notifier chain.
1286  *
1287  *      Calling this function on an active interface is a nop. On a failure
1288  *      a negative errno code is returned.
1289  */
1290 int dev_open(struct net_device *dev)
1291 {
1292         int ret;
1293
1294         if (dev->flags & IFF_UP)
1295                 return 0;
1296
1297         ret = __dev_open(dev);
1298         if (ret < 0)
1299                 return ret;
1300
1301         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1302         call_netdevice_notifiers(NETDEV_UP, dev);
1303
1304         return ret;
1305 }
1306 EXPORT_SYMBOL(dev_open);
1307
1308 static int __dev_close_many(struct list_head *head)
1309 {
1310         struct net_device *dev;
1311
1312         ASSERT_RTNL();
1313         might_sleep();
1314
1315         list_for_each_entry(dev, head, close_list) {
1316                 /* Temporarily disable netpoll until the interface is down */
1317                 netpoll_poll_disable(dev);
1318
1319                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1320
1321                 clear_bit(__LINK_STATE_START, &dev->state);
1322
1323                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1324                  * can be even on different cpu. So just clear netif_running().
1325                  *
1326                  * dev->stop() will invoke napi_disable() on all of it's
1327                  * napi_struct instances on this device.
1328                  */
1329                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1330         }
1331
1332         dev_deactivate_many(head);
1333
1334         list_for_each_entry(dev, head, close_list) {
1335                 const struct net_device_ops *ops = dev->netdev_ops;
1336
1337                 /*
1338                  *      Call the device specific close. This cannot fail.
1339                  *      Only if device is UP
1340                  *
1341                  *      We allow it to be called even after a DETACH hot-plug
1342                  *      event.
1343                  */
1344                 if (ops->ndo_stop)
1345                         ops->ndo_stop(dev);
1346
1347                 dev->flags &= ~IFF_UP;
1348                 net_dmaengine_put();
1349                 netpoll_poll_enable(dev);
1350         }
1351
1352         return 0;
1353 }
1354
1355 static int __dev_close(struct net_device *dev)
1356 {
1357         int retval;
1358         LIST_HEAD(single);
1359
1360         list_add(&dev->close_list, &single);
1361         retval = __dev_close_many(&single);
1362         list_del(&single);
1363
1364         return retval;
1365 }
1366
1367 static int dev_close_many(struct list_head *head)
1368 {
1369         struct net_device *dev, *tmp;
1370
1371         /* Remove the devices that don't need to be closed */
1372         list_for_each_entry_safe(dev, tmp, head, close_list)
1373                 if (!(dev->flags & IFF_UP))
1374                         list_del_init(&dev->close_list);
1375
1376         __dev_close_many(head);
1377
1378         list_for_each_entry_safe(dev, tmp, head, close_list) {
1379                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1380                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1381                 list_del_init(&dev->close_list);
1382         }
1383
1384         return 0;
1385 }
1386
1387 /**
1388  *      dev_close - shutdown an interface.
1389  *      @dev: device to shutdown
1390  *
1391  *      This function moves an active device into down state. A
1392  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1393  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1394  *      chain.
1395  */
1396 int dev_close(struct net_device *dev)
1397 {
1398         if (dev->flags & IFF_UP) {
1399                 LIST_HEAD(single);
1400
1401                 list_add(&dev->close_list, &single);
1402                 dev_close_many(&single);
1403                 list_del(&single);
1404         }
1405         return 0;
1406 }
1407 EXPORT_SYMBOL(dev_close);
1408
1409
1410 /**
1411  *      dev_disable_lro - disable Large Receive Offload on a device
1412  *      @dev: device
1413  *
1414  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1415  *      called under RTNL.  This is needed if received packets may be
1416  *      forwarded to another interface.
1417  */
1418 void dev_disable_lro(struct net_device *dev)
1419 {
1420         /*
1421          * If we're trying to disable lro on a vlan device
1422          * use the underlying physical device instead
1423          */
1424         if (is_vlan_dev(dev))
1425                 dev = vlan_dev_real_dev(dev);
1426
1427         /* the same for macvlan devices */
1428         if (netif_is_macvlan(dev))
1429                 dev = macvlan_dev_real_dev(dev);
1430
1431         dev->wanted_features &= ~NETIF_F_LRO;
1432         netdev_update_features(dev);
1433
1434         if (unlikely(dev->features & NETIF_F_LRO))
1435                 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1440                                    struct net_device *dev)
1441 {
1442         struct netdev_notifier_info info;
1443
1444         netdev_notifier_info_init(&info, dev);
1445         return nb->notifier_call(nb, val, &info);
1446 }
1447
1448 static int dev_boot_phase = 1;
1449
1450 /**
1451  *      register_netdevice_notifier - register a network notifier block
1452  *      @nb: notifier
1453  *
1454  *      Register a notifier to be called when network device events occur.
1455  *      The notifier passed is linked into the kernel structures and must
1456  *      not be reused until it has been unregistered. A negative errno code
1457  *      is returned on a failure.
1458  *
1459  *      When registered all registration and up events are replayed
1460  *      to the new notifier to allow device to have a race free
1461  *      view of the network device list.
1462  */
1463
1464 int register_netdevice_notifier(struct notifier_block *nb)
1465 {
1466         struct net_device *dev;
1467         struct net_device *last;
1468         struct net *net;
1469         int err;
1470
1471         rtnl_lock();
1472         err = raw_notifier_chain_register(&netdev_chain, nb);
1473         if (err)
1474                 goto unlock;
1475         if (dev_boot_phase)
1476                 goto unlock;
1477         for_each_net(net) {
1478                 for_each_netdev(net, dev) {
1479                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1480                         err = notifier_to_errno(err);
1481                         if (err)
1482                                 goto rollback;
1483
1484                         if (!(dev->flags & IFF_UP))
1485                                 continue;
1486
1487                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1488                 }
1489         }
1490
1491 unlock:
1492         rtnl_unlock();
1493         return err;
1494
1495 rollback:
1496         last = dev;
1497         for_each_net(net) {
1498                 for_each_netdev(net, dev) {
1499                         if (dev == last)
1500                                 goto outroll;
1501
1502                         if (dev->flags & IFF_UP) {
1503                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1504                                                         dev);
1505                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1506                         }
1507                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1508                 }
1509         }
1510
1511 outroll:
1512         raw_notifier_chain_unregister(&netdev_chain, nb);
1513         goto unlock;
1514 }
1515 EXPORT_SYMBOL(register_netdevice_notifier);
1516
1517 /**
1518  *      unregister_netdevice_notifier - unregister a network notifier block
1519  *      @nb: notifier
1520  *
1521  *      Unregister a notifier previously registered by
1522  *      register_netdevice_notifier(). The notifier is unlinked into the
1523  *      kernel structures and may then be reused. A negative errno code
1524  *      is returned on a failure.
1525  *
1526  *      After unregistering unregister and down device events are synthesized
1527  *      for all devices on the device list to the removed notifier to remove
1528  *      the need for special case cleanup code.
1529  */
1530
1531 int unregister_netdevice_notifier(struct notifier_block *nb)
1532 {
1533         struct net_device *dev;
1534         struct net *net;
1535         int err;
1536
1537         rtnl_lock();
1538         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1539         if (err)
1540                 goto unlock;
1541
1542         for_each_net(net) {
1543                 for_each_netdev(net, dev) {
1544                         if (dev->flags & IFF_UP) {
1545                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1546                                                         dev);
1547                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1548                         }
1549                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1550                 }
1551         }
1552 unlock:
1553         rtnl_unlock();
1554         return err;
1555 }
1556 EXPORT_SYMBOL(unregister_netdevice_notifier);
1557
1558 /**
1559  *      call_netdevice_notifiers_info - call all network notifier blocks
1560  *      @val: value passed unmodified to notifier function
1561  *      @dev: net_device pointer passed unmodified to notifier function
1562  *      @info: notifier information data
1563  *
1564  *      Call all network notifier blocks.  Parameters and return value
1565  *      are as for raw_notifier_call_chain().
1566  */
1567
1568 static int call_netdevice_notifiers_info(unsigned long val,
1569                                          struct net_device *dev,
1570                                          struct netdev_notifier_info *info)
1571 {
1572         ASSERT_RTNL();
1573         netdev_notifier_info_init(info, dev);
1574         return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576
1577 /**
1578  *      call_netdevice_notifiers - call all network notifier blocks
1579  *      @val: value passed unmodified to notifier function
1580  *      @dev: net_device pointer passed unmodified to notifier function
1581  *
1582  *      Call all network notifier blocks.  Parameters and return value
1583  *      are as for raw_notifier_call_chain().
1584  */
1585
1586 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1587 {
1588         struct netdev_notifier_info info;
1589
1590         return call_netdevice_notifiers_info(val, dev, &info);
1591 }
1592 EXPORT_SYMBOL(call_netdevice_notifiers);
1593
1594 static struct static_key netstamp_needed __read_mostly;
1595 #ifdef HAVE_JUMP_LABEL
1596 /* We are not allowed to call static_key_slow_dec() from irq context
1597  * If net_disable_timestamp() is called from irq context, defer the
1598  * static_key_slow_dec() calls.
1599  */
1600 static atomic_t netstamp_needed_deferred;
1601 #endif
1602
1603 void net_enable_timestamp(void)
1604 {
1605 #ifdef HAVE_JUMP_LABEL
1606         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1607
1608         if (deferred) {
1609                 while (--deferred)
1610                         static_key_slow_dec(&netstamp_needed);
1611                 return;
1612         }
1613 #endif
1614         static_key_slow_inc(&netstamp_needed);
1615 }
1616 EXPORT_SYMBOL(net_enable_timestamp);
1617
1618 void net_disable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621         if (in_interrupt()) {
1622                 atomic_inc(&netstamp_needed_deferred);
1623                 return;
1624         }
1625 #endif
1626         static_key_slow_dec(&netstamp_needed);
1627 }
1628 EXPORT_SYMBOL(net_disable_timestamp);
1629
1630 static inline void net_timestamp_set(struct sk_buff *skb)
1631 {
1632         skb->tstamp.tv64 = 0;
1633         if (static_key_false(&netstamp_needed))
1634                 __net_timestamp(skb);
1635 }
1636
1637 #define net_timestamp_check(COND, SKB)                  \
1638         if (static_key_false(&netstamp_needed)) {               \
1639                 if ((COND) && !(SKB)->tstamp.tv64)      \
1640                         __net_timestamp(SKB);           \
1641         }                                               \
1642
1643 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1644 {
1645         unsigned int len;
1646
1647         if (!(dev->flags & IFF_UP))
1648                 return false;
1649
1650         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1651         if (skb->len <= len)
1652                 return true;
1653
1654         /* if TSO is enabled, we don't care about the length as the packet
1655          * could be forwarded without being segmented before
1656          */
1657         if (skb_is_gso(skb))
1658                 return true;
1659
1660         return false;
1661 }
1662 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1663
1664 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1665 {
1666         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1667                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1668                         atomic_long_inc(&dev->rx_dropped);
1669                         kfree_skb(skb);
1670                         return NET_RX_DROP;
1671                 }
1672         }
1673
1674         if (unlikely(!is_skb_forwardable(dev, skb))) {
1675                 atomic_long_inc(&dev->rx_dropped);
1676                 kfree_skb(skb);
1677                 return NET_RX_DROP;
1678         }
1679
1680         skb_scrub_packet(skb, true);
1681         skb->protocol = eth_type_trans(skb, dev);
1682
1683         return 0;
1684 }
1685 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1686
1687 /**
1688  * dev_forward_skb - loopback an skb to another netif
1689  *
1690  * @dev: destination network device
1691  * @skb: buffer to forward
1692  *
1693  * return values:
1694  *      NET_RX_SUCCESS  (no congestion)
1695  *      NET_RX_DROP     (packet was dropped, but freed)
1696  *
1697  * dev_forward_skb can be used for injecting an skb from the
1698  * start_xmit function of one device into the receive queue
1699  * of another device.
1700  *
1701  * The receiving device may be in another namespace, so
1702  * we have to clear all information in the skb that could
1703  * impact namespace isolation.
1704  */
1705 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1706 {
1707         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1708 }
1709 EXPORT_SYMBOL_GPL(dev_forward_skb);
1710
1711 static inline int deliver_skb(struct sk_buff *skb,
1712                               struct packet_type *pt_prev,
1713                               struct net_device *orig_dev)
1714 {
1715         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1716                 return -ENOMEM;
1717         atomic_inc(&skb->users);
1718         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1719 }
1720
1721 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1722 {
1723         if (!ptype->af_packet_priv || !skb->sk)
1724                 return false;
1725
1726         if (ptype->id_match)
1727                 return ptype->id_match(ptype, skb->sk);
1728         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1729                 return true;
1730
1731         return false;
1732 }
1733
1734 /*
1735  *      Support routine. Sends outgoing frames to any network
1736  *      taps currently in use.
1737  */
1738
1739 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1740 {
1741         struct packet_type *ptype;
1742         struct sk_buff *skb2 = NULL;
1743         struct packet_type *pt_prev = NULL;
1744
1745         rcu_read_lock();
1746         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1747                 /* Never send packets back to the socket
1748                  * they originated from - MvS (miquels@drinkel.ow.org)
1749                  */
1750                 if ((ptype->dev == dev || !ptype->dev) &&
1751                     (!skb_loop_sk(ptype, skb))) {
1752                         if (pt_prev) {
1753                                 deliver_skb(skb2, pt_prev, skb->dev);
1754                                 pt_prev = ptype;
1755                                 continue;
1756                         }
1757
1758                         skb2 = skb_clone(skb, GFP_ATOMIC);
1759                         if (!skb2)
1760                                 break;
1761
1762                         net_timestamp_set(skb2);
1763
1764                         /* skb->nh should be correctly
1765                            set by sender, so that the second statement is
1766                            just protection against buggy protocols.
1767                          */
1768                         skb_reset_mac_header(skb2);
1769
1770                         if (skb_network_header(skb2) < skb2->data ||
1771                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1772                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1773                                                      ntohs(skb2->protocol),
1774                                                      dev->name);
1775                                 skb_reset_network_header(skb2);
1776                         }
1777
1778                         skb2->transport_header = skb2->network_header;
1779                         skb2->pkt_type = PACKET_OUTGOING;
1780                         pt_prev = ptype;
1781                 }
1782         }
1783         if (pt_prev)
1784                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1785         rcu_read_unlock();
1786 }
1787
1788 /**
1789  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1790  * @dev: Network device
1791  * @txq: number of queues available
1792  *
1793  * If real_num_tx_queues is changed the tc mappings may no longer be
1794  * valid. To resolve this verify the tc mapping remains valid and if
1795  * not NULL the mapping. With no priorities mapping to this
1796  * offset/count pair it will no longer be used. In the worst case TC0
1797  * is invalid nothing can be done so disable priority mappings. If is
1798  * expected that drivers will fix this mapping if they can before
1799  * calling netif_set_real_num_tx_queues.
1800  */
1801 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1802 {
1803         int i;
1804         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1805
1806         /* If TC0 is invalidated disable TC mapping */
1807         if (tc->offset + tc->count > txq) {
1808                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1809                 dev->num_tc = 0;
1810                 return;
1811         }
1812
1813         /* Invalidated prio to tc mappings set to TC0 */
1814         for (i = 1; i < TC_BITMASK + 1; i++) {
1815                 int q = netdev_get_prio_tc_map(dev, i);
1816
1817                 tc = &dev->tc_to_txq[q];
1818                 if (tc->offset + tc->count > txq) {
1819                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1820                                 i, q);
1821                         netdev_set_prio_tc_map(dev, i, 0);
1822                 }
1823         }
1824 }
1825
1826 #ifdef CONFIG_XPS
1827 static DEFINE_MUTEX(xps_map_mutex);
1828 #define xmap_dereference(P)             \
1829         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1830
1831 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1832                                         int cpu, u16 index)
1833 {
1834         struct xps_map *map = NULL;
1835         int pos;
1836
1837         if (dev_maps)
1838                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1839
1840         for (pos = 0; map && pos < map->len; pos++) {
1841                 if (map->queues[pos] == index) {
1842                         if (map->len > 1) {
1843                                 map->queues[pos] = map->queues[--map->len];
1844                         } else {
1845                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1846                                 kfree_rcu(map, rcu);
1847                                 map = NULL;
1848                         }
1849                         break;
1850                 }
1851         }
1852
1853         return map;
1854 }
1855
1856 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1857 {
1858         struct xps_dev_maps *dev_maps;
1859         int cpu, i;
1860         bool active = false;
1861
1862         mutex_lock(&xps_map_mutex);
1863         dev_maps = xmap_dereference(dev->xps_maps);
1864
1865         if (!dev_maps)
1866                 goto out_no_maps;
1867
1868         for_each_possible_cpu(cpu) {
1869                 for (i = index; i < dev->num_tx_queues; i++) {
1870                         if (!remove_xps_queue(dev_maps, cpu, i))
1871                                 break;
1872                 }
1873                 if (i == dev->num_tx_queues)
1874                         active = true;
1875         }
1876
1877         if (!active) {
1878                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1879                 kfree_rcu(dev_maps, rcu);
1880         }
1881
1882         for (i = index; i < dev->num_tx_queues; i++)
1883                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1884                                              NUMA_NO_NODE);
1885
1886 out_no_maps:
1887         mutex_unlock(&xps_map_mutex);
1888 }
1889
1890 static struct xps_map *expand_xps_map(struct xps_map *map,
1891                                       int cpu, u16 index)
1892 {
1893         struct xps_map *new_map;
1894         int alloc_len = XPS_MIN_MAP_ALLOC;
1895         int i, pos;
1896
1897         for (pos = 0; map && pos < map->len; pos++) {
1898                 if (map->queues[pos] != index)
1899                         continue;
1900                 return map;
1901         }
1902
1903         /* Need to add queue to this CPU's existing map */
1904         if (map) {
1905                 if (pos < map->alloc_len)
1906                         return map;
1907
1908                 alloc_len = map->alloc_len * 2;
1909         }
1910
1911         /* Need to allocate new map to store queue on this CPU's map */
1912         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1913                                cpu_to_node(cpu));
1914         if (!new_map)
1915                 return NULL;
1916
1917         for (i = 0; i < pos; i++)
1918                 new_map->queues[i] = map->queues[i];
1919         new_map->alloc_len = alloc_len;
1920         new_map->len = pos;
1921
1922         return new_map;
1923 }
1924
1925 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1926                         u16 index)
1927 {
1928         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1929         struct xps_map *map, *new_map;
1930         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1931         int cpu, numa_node_id = -2;
1932         bool active = false;
1933
1934         mutex_lock(&xps_map_mutex);
1935
1936         dev_maps = xmap_dereference(dev->xps_maps);
1937
1938         /* allocate memory for queue storage */
1939         for_each_online_cpu(cpu) {
1940                 if (!cpumask_test_cpu(cpu, mask))
1941                         continue;
1942
1943                 if (!new_dev_maps)
1944                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1945                 if (!new_dev_maps) {
1946                         mutex_unlock(&xps_map_mutex);
1947                         return -ENOMEM;
1948                 }
1949
1950                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1951                                  NULL;
1952
1953                 map = expand_xps_map(map, cpu, index);
1954                 if (!map)
1955                         goto error;
1956
1957                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1958         }
1959
1960         if (!new_dev_maps)
1961                 goto out_no_new_maps;
1962
1963         for_each_possible_cpu(cpu) {
1964                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1965                         /* add queue to CPU maps */
1966                         int pos = 0;
1967
1968                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1969                         while ((pos < map->len) && (map->queues[pos] != index))
1970                                 pos++;
1971
1972                         if (pos == map->len)
1973                                 map->queues[map->len++] = index;
1974 #ifdef CONFIG_NUMA
1975                         if (numa_node_id == -2)
1976                                 numa_node_id = cpu_to_node(cpu);
1977                         else if (numa_node_id != cpu_to_node(cpu))
1978                                 numa_node_id = -1;
1979 #endif
1980                 } else if (dev_maps) {
1981                         /* fill in the new device map from the old device map */
1982                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1983                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1984                 }
1985
1986         }
1987
1988         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1989
1990         /* Cleanup old maps */
1991         if (dev_maps) {
1992                 for_each_possible_cpu(cpu) {
1993                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1994                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1995                         if (map && map != new_map)
1996                                 kfree_rcu(map, rcu);
1997                 }
1998
1999                 kfree_rcu(dev_maps, rcu);
2000         }
2001
2002         dev_maps = new_dev_maps;
2003         active = true;
2004
2005 out_no_new_maps:
2006         /* update Tx queue numa node */
2007         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2008                                      (numa_node_id >= 0) ? numa_node_id :
2009                                      NUMA_NO_NODE);
2010
2011         if (!dev_maps)
2012                 goto out_no_maps;
2013
2014         /* removes queue from unused CPUs */
2015         for_each_possible_cpu(cpu) {
2016                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2017                         continue;
2018
2019                 if (remove_xps_queue(dev_maps, cpu, index))
2020                         active = true;
2021         }
2022
2023         /* free map if not active */
2024         if (!active) {
2025                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2026                 kfree_rcu(dev_maps, rcu);
2027         }
2028
2029 out_no_maps:
2030         mutex_unlock(&xps_map_mutex);
2031
2032         return 0;
2033 error:
2034         /* remove any maps that we added */
2035         for_each_possible_cpu(cpu) {
2036                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2037                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2038                                  NULL;
2039                 if (new_map && new_map != map)
2040                         kfree(new_map);
2041         }
2042
2043         mutex_unlock(&xps_map_mutex);
2044
2045         kfree(new_dev_maps);
2046         return -ENOMEM;
2047 }
2048 EXPORT_SYMBOL(netif_set_xps_queue);
2049
2050 #endif
2051 /*
2052  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2053  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2054  */
2055 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2056 {
2057         int rc;
2058
2059         if (txq < 1 || txq > dev->num_tx_queues)
2060                 return -EINVAL;
2061
2062         if (dev->reg_state == NETREG_REGISTERED ||
2063             dev->reg_state == NETREG_UNREGISTERING) {
2064                 ASSERT_RTNL();
2065
2066                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2067                                                   txq);
2068                 if (rc)
2069                         return rc;
2070
2071                 if (dev->num_tc)
2072                         netif_setup_tc(dev, txq);
2073
2074                 if (txq < dev->real_num_tx_queues) {
2075                         qdisc_reset_all_tx_gt(dev, txq);
2076 #ifdef CONFIG_XPS
2077                         netif_reset_xps_queues_gt(dev, txq);
2078 #endif
2079                 }
2080         }
2081
2082         dev->real_num_tx_queues = txq;
2083         return 0;
2084 }
2085 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2086
2087 #ifdef CONFIG_SYSFS
2088 /**
2089  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2090  *      @dev: Network device
2091  *      @rxq: Actual number of RX queues
2092  *
2093  *      This must be called either with the rtnl_lock held or before
2094  *      registration of the net device.  Returns 0 on success, or a
2095  *      negative error code.  If called before registration, it always
2096  *      succeeds.
2097  */
2098 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2099 {
2100         int rc;
2101
2102         if (rxq < 1 || rxq > dev->num_rx_queues)
2103                 return -EINVAL;
2104
2105         if (dev->reg_state == NETREG_REGISTERED) {
2106                 ASSERT_RTNL();
2107
2108                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2109                                                   rxq);
2110                 if (rc)
2111                         return rc;
2112         }
2113
2114         dev->real_num_rx_queues = rxq;
2115         return 0;
2116 }
2117 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2118 #endif
2119
2120 /**
2121  * netif_get_num_default_rss_queues - default number of RSS queues
2122  *
2123  * This routine should set an upper limit on the number of RSS queues
2124  * used by default by multiqueue devices.
2125  */
2126 int netif_get_num_default_rss_queues(void)
2127 {
2128         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2129 }
2130 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2131
2132 static inline void __netif_reschedule(struct Qdisc *q)
2133 {
2134         struct softnet_data *sd;
2135         unsigned long flags;
2136
2137         local_irq_save(flags);
2138         sd = &__get_cpu_var(softnet_data);
2139         q->next_sched = NULL;
2140         *sd->output_queue_tailp = q;
2141         sd->output_queue_tailp = &q->next_sched;
2142         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2143         local_irq_restore(flags);
2144 }
2145
2146 void __netif_schedule(struct Qdisc *q)
2147 {
2148         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2149                 __netif_reschedule(q);
2150 }
2151 EXPORT_SYMBOL(__netif_schedule);
2152
2153 struct dev_kfree_skb_cb {
2154         enum skb_free_reason reason;
2155 };
2156
2157 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2158 {
2159         return (struct dev_kfree_skb_cb *)skb->cb;
2160 }
2161
2162 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2163 {
2164         unsigned long flags;
2165
2166         if (likely(atomic_read(&skb->users) == 1)) {
2167                 smp_rmb();
2168                 atomic_set(&skb->users, 0);
2169         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2170                 return;
2171         }
2172         get_kfree_skb_cb(skb)->reason = reason;
2173         local_irq_save(flags);
2174         skb->next = __this_cpu_read(softnet_data.completion_queue);
2175         __this_cpu_write(softnet_data.completion_queue, skb);
2176         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2177         local_irq_restore(flags);
2178 }
2179 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2180
2181 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2182 {
2183         if (in_irq() || irqs_disabled())
2184                 __dev_kfree_skb_irq(skb, reason);
2185         else
2186                 dev_kfree_skb(skb);
2187 }
2188 EXPORT_SYMBOL(__dev_kfree_skb_any);
2189
2190
2191 /**
2192  * netif_device_detach - mark device as removed
2193  * @dev: network device
2194  *
2195  * Mark device as removed from system and therefore no longer available.
2196  */
2197 void netif_device_detach(struct net_device *dev)
2198 {
2199         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2200             netif_running(dev)) {
2201                 netif_tx_stop_all_queues(dev);
2202         }
2203 }
2204 EXPORT_SYMBOL(netif_device_detach);
2205
2206 /**
2207  * netif_device_attach - mark device as attached
2208  * @dev: network device
2209  *
2210  * Mark device as attached from system and restart if needed.
2211  */
2212 void netif_device_attach(struct net_device *dev)
2213 {
2214         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2215             netif_running(dev)) {
2216                 netif_tx_wake_all_queues(dev);
2217                 __netdev_watchdog_up(dev);
2218         }
2219 }
2220 EXPORT_SYMBOL(netif_device_attach);
2221
2222 static void skb_warn_bad_offload(const struct sk_buff *skb)
2223 {
2224         static const netdev_features_t null_features = 0;
2225         struct net_device *dev = skb->dev;
2226         const char *driver = "";
2227
2228         if (!net_ratelimit())
2229                 return;
2230
2231         if (dev && dev->dev.parent)
2232                 driver = dev_driver_string(dev->dev.parent);
2233
2234         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2235              "gso_type=%d ip_summed=%d\n",
2236              driver, dev ? &dev->features : &null_features,
2237              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2238              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2239              skb_shinfo(skb)->gso_type, skb->ip_summed);
2240 }
2241
2242 /*
2243  * Invalidate hardware checksum when packet is to be mangled, and
2244  * complete checksum manually on outgoing path.
2245  */
2246 int skb_checksum_help(struct sk_buff *skb)
2247 {
2248         __wsum csum;
2249         int ret = 0, offset;
2250
2251         if (skb->ip_summed == CHECKSUM_COMPLETE)
2252                 goto out_set_summed;
2253
2254         if (unlikely(skb_shinfo(skb)->gso_size)) {
2255                 skb_warn_bad_offload(skb);
2256                 return -EINVAL;
2257         }
2258
2259         /* Before computing a checksum, we should make sure no frag could
2260          * be modified by an external entity : checksum could be wrong.
2261          */
2262         if (skb_has_shared_frag(skb)) {
2263                 ret = __skb_linearize(skb);
2264                 if (ret)
2265                         goto out;
2266         }
2267
2268         offset = skb_checksum_start_offset(skb);
2269         BUG_ON(offset >= skb_headlen(skb));
2270         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2271
2272         offset += skb->csum_offset;
2273         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2274
2275         if (skb_cloned(skb) &&
2276             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2277                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2278                 if (ret)
2279                         goto out;
2280         }
2281
2282         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2283 out_set_summed:
2284         skb->ip_summed = CHECKSUM_NONE;
2285 out:
2286         return ret;
2287 }
2288 EXPORT_SYMBOL(skb_checksum_help);
2289
2290 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2291 {
2292         unsigned int vlan_depth = skb->mac_len;
2293         __be16 type = skb->protocol;
2294
2295         /* Tunnel gso handlers can set protocol to ethernet. */
2296         if (type == htons(ETH_P_TEB)) {
2297                 struct ethhdr *eth;
2298
2299                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2300                         return 0;
2301
2302                 eth = (struct ethhdr *)skb_mac_header(skb);
2303                 type = eth->h_proto;
2304         }
2305
2306         /* if skb->protocol is 802.1Q/AD then the header should already be
2307          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2308          * ETH_HLEN otherwise
2309          */
2310         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2311                 if (vlan_depth) {
2312                         if (unlikely(WARN_ON(vlan_depth < VLAN_HLEN)))
2313                                 return 0;
2314                         vlan_depth -= VLAN_HLEN;
2315                 } else {
2316                         vlan_depth = ETH_HLEN;
2317                 }
2318                 do {
2319                         struct vlan_hdr *vh;
2320
2321                         if (unlikely(!pskb_may_pull(skb,
2322                                                     vlan_depth + VLAN_HLEN)))
2323                                 return 0;
2324
2325                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2326                         type = vh->h_vlan_encapsulated_proto;
2327                         vlan_depth += VLAN_HLEN;
2328                 } while (type == htons(ETH_P_8021Q) ||
2329                          type == htons(ETH_P_8021AD));
2330         }
2331
2332         *depth = vlan_depth;
2333
2334         return type;
2335 }
2336
2337 /**
2338  *      skb_mac_gso_segment - mac layer segmentation handler.
2339  *      @skb: buffer to segment
2340  *      @features: features for the output path (see dev->features)
2341  */
2342 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2343                                     netdev_features_t features)
2344 {
2345         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2346         struct packet_offload *ptype;
2347         int vlan_depth = skb->mac_len;
2348         __be16 type = skb_network_protocol(skb, &vlan_depth);
2349
2350         if (unlikely(!type))
2351                 return ERR_PTR(-EINVAL);
2352
2353         __skb_pull(skb, vlan_depth);
2354
2355         rcu_read_lock();
2356         list_for_each_entry_rcu(ptype, &offload_base, list) {
2357                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2358                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2359                                 int err;
2360
2361                                 err = ptype->callbacks.gso_send_check(skb);
2362                                 segs = ERR_PTR(err);
2363                                 if (err || skb_gso_ok(skb, features))
2364                                         break;
2365                                 __skb_push(skb, (skb->data -
2366                                                  skb_network_header(skb)));
2367                         }
2368                         segs = ptype->callbacks.gso_segment(skb, features);
2369                         break;
2370                 }
2371         }
2372         rcu_read_unlock();
2373
2374         __skb_push(skb, skb->data - skb_mac_header(skb));
2375
2376         return segs;
2377 }
2378 EXPORT_SYMBOL(skb_mac_gso_segment);
2379
2380
2381 /* openvswitch calls this on rx path, so we need a different check.
2382  */
2383 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2384 {
2385         if (tx_path)
2386                 return skb->ip_summed != CHECKSUM_PARTIAL;
2387         else
2388                 return skb->ip_summed == CHECKSUM_NONE;
2389 }
2390
2391 /**
2392  *      __skb_gso_segment - Perform segmentation on skb.
2393  *      @skb: buffer to segment
2394  *      @features: features for the output path (see dev->features)
2395  *      @tx_path: whether it is called in TX path
2396  *
2397  *      This function segments the given skb and returns a list of segments.
2398  *
2399  *      It may return NULL if the skb requires no segmentation.  This is
2400  *      only possible when GSO is used for verifying header integrity.
2401  */
2402 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2403                                   netdev_features_t features, bool tx_path)
2404 {
2405         if (unlikely(skb_needs_check(skb, tx_path))) {
2406                 int err;
2407
2408                 skb_warn_bad_offload(skb);
2409
2410                 if (skb_header_cloned(skb) &&
2411                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2412                         return ERR_PTR(err);
2413         }
2414
2415         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2416         SKB_GSO_CB(skb)->encap_level = 0;
2417
2418         skb_reset_mac_header(skb);
2419         skb_reset_mac_len(skb);
2420
2421         return skb_mac_gso_segment(skb, features);
2422 }
2423 EXPORT_SYMBOL(__skb_gso_segment);
2424
2425 /* Take action when hardware reception checksum errors are detected. */
2426 #ifdef CONFIG_BUG
2427 void netdev_rx_csum_fault(struct net_device *dev)
2428 {
2429         if (net_ratelimit()) {
2430                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2431                 dump_stack();
2432         }
2433 }
2434 EXPORT_SYMBOL(netdev_rx_csum_fault);
2435 #endif
2436
2437 /* Actually, we should eliminate this check as soon as we know, that:
2438  * 1. IOMMU is present and allows to map all the memory.
2439  * 2. No high memory really exists on this machine.
2440  */
2441
2442 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2443 {
2444 #ifdef CONFIG_HIGHMEM
2445         int i;
2446         if (!(dev->features & NETIF_F_HIGHDMA)) {
2447                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2448                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2449                         if (PageHighMem(skb_frag_page(frag)))
2450                                 return 1;
2451                 }
2452         }
2453
2454         if (PCI_DMA_BUS_IS_PHYS) {
2455                 struct device *pdev = dev->dev.parent;
2456
2457                 if (!pdev)
2458                         return 0;
2459                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2460                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2461                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2462                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2463                                 return 1;
2464                 }
2465         }
2466 #endif
2467         return 0;
2468 }
2469
2470 struct dev_gso_cb {
2471         void (*destructor)(struct sk_buff *skb);
2472 };
2473
2474 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2475
2476 static void dev_gso_skb_destructor(struct sk_buff *skb)
2477 {
2478         struct dev_gso_cb *cb;
2479
2480         kfree_skb_list(skb->next);
2481         skb->next = NULL;
2482
2483         cb = DEV_GSO_CB(skb);
2484         if (cb->destructor)
2485                 cb->destructor(skb);
2486 }
2487
2488 /**
2489  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2490  *      @skb: buffer to segment
2491  *      @features: device features as applicable to this skb
2492  *
2493  *      This function segments the given skb and stores the list of segments
2494  *      in skb->next.
2495  */
2496 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2497 {
2498         struct sk_buff *segs;
2499
2500         segs = skb_gso_segment(skb, features);
2501
2502         /* Verifying header integrity only. */
2503         if (!segs)
2504                 return 0;
2505
2506         if (IS_ERR(segs))
2507                 return PTR_ERR(segs);
2508
2509         skb->next = segs;
2510         DEV_GSO_CB(skb)->destructor = skb->destructor;
2511         skb->destructor = dev_gso_skb_destructor;
2512
2513         return 0;
2514 }
2515
2516 static netdev_features_t harmonize_features(struct sk_buff *skb,
2517         netdev_features_t features)
2518 {
2519         int tmp;
2520
2521         if (skb->ip_summed != CHECKSUM_NONE &&
2522             !can_checksum_protocol(features, skb_network_protocol(skb, &tmp))) {
2523                 features &= ~NETIF_F_ALL_CSUM;
2524         } else if (illegal_highdma(skb->dev, skb)) {
2525                 features &= ~NETIF_F_SG;
2526         }
2527
2528         return features;
2529 }
2530
2531 netdev_features_t netif_skb_features(struct sk_buff *skb)
2532 {
2533         __be16 protocol = skb->protocol;
2534         netdev_features_t features = skb->dev->features;
2535
2536         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2537                 features &= ~NETIF_F_GSO_MASK;
2538
2539         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2540                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2541                 protocol = veh->h_vlan_encapsulated_proto;
2542         } else if (!vlan_tx_tag_present(skb)) {
2543                 return harmonize_features(skb, features);
2544         }
2545
2546         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2547                                                NETIF_F_HW_VLAN_STAG_TX);
2548
2549         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2550                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2551                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2552                                 NETIF_F_HW_VLAN_STAG_TX;
2553
2554         return harmonize_features(skb, features);
2555 }
2556 EXPORT_SYMBOL(netif_skb_features);
2557
2558 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2559                         struct netdev_queue *txq)
2560 {
2561         const struct net_device_ops *ops = dev->netdev_ops;
2562         int rc = NETDEV_TX_OK;
2563         unsigned int skb_len;
2564
2565         if (likely(!skb->next)) {
2566                 netdev_features_t features;
2567
2568                 /*
2569                  * If device doesn't need skb->dst, release it right now while
2570                  * its hot in this cpu cache
2571                  */
2572                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2573                         skb_dst_drop(skb);
2574
2575                 features = netif_skb_features(skb);
2576
2577                 if (vlan_tx_tag_present(skb) &&
2578                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2579                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2580                                              vlan_tx_tag_get(skb));
2581                         if (unlikely(!skb))
2582                                 goto out;
2583
2584                         skb->vlan_tci = 0;
2585                 }
2586
2587                 /* If encapsulation offload request, verify we are testing
2588                  * hardware encapsulation features instead of standard
2589                  * features for the netdev
2590                  */
2591                 if (skb->encapsulation)
2592                         features &= dev->hw_enc_features;
2593
2594                 if (netif_needs_gso(skb, features)) {
2595                         if (unlikely(dev_gso_segment(skb, features)))
2596                                 goto out_kfree_skb;
2597                         if (skb->next)
2598                                 goto gso;
2599                 } else {
2600                         if (skb_needs_linearize(skb, features) &&
2601                             __skb_linearize(skb))
2602                                 goto out_kfree_skb;
2603
2604                         /* If packet is not checksummed and device does not
2605                          * support checksumming for this protocol, complete
2606                          * checksumming here.
2607                          */
2608                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2609                                 if (skb->encapsulation)
2610                                         skb_set_inner_transport_header(skb,
2611                                                 skb_checksum_start_offset(skb));
2612                                 else
2613                                         skb_set_transport_header(skb,
2614                                                 skb_checksum_start_offset(skb));
2615                                 if (!(features & NETIF_F_ALL_CSUM) &&
2616                                      skb_checksum_help(skb))
2617                                         goto out_kfree_skb;
2618                         }
2619                 }
2620
2621                 if (!list_empty(&ptype_all))
2622                         dev_queue_xmit_nit(skb, dev);
2623
2624                 skb_len = skb->len;
2625                 trace_net_dev_start_xmit(skb, dev);
2626                 rc = ops->ndo_start_xmit(skb, dev);
2627                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2628                 if (rc == NETDEV_TX_OK)
2629                         txq_trans_update(txq);
2630                 return rc;
2631         }
2632
2633 gso:
2634         do {
2635                 struct sk_buff *nskb = skb->next;
2636
2637                 skb->next = nskb->next;
2638                 nskb->next = NULL;
2639
2640                 if (!list_empty(&ptype_all))
2641                         dev_queue_xmit_nit(nskb, dev);
2642
2643                 skb_len = nskb->len;
2644                 trace_net_dev_start_xmit(nskb, dev);
2645                 rc = ops->ndo_start_xmit(nskb, dev);
2646                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2647                 if (unlikely(rc != NETDEV_TX_OK)) {
2648                         if (rc & ~NETDEV_TX_MASK)
2649                                 goto out_kfree_gso_skb;
2650                         nskb->next = skb->next;
2651                         skb->next = nskb;
2652                         return rc;
2653                 }
2654                 txq_trans_update(txq);
2655                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2656                         return NETDEV_TX_BUSY;
2657         } while (skb->next);
2658
2659 out_kfree_gso_skb:
2660         if (likely(skb->next == NULL)) {
2661                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2662                 consume_skb(skb);
2663                 return rc;
2664         }
2665 out_kfree_skb:
2666         kfree_skb(skb);
2667 out:
2668         return rc;
2669 }
2670 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2671
2672 static void qdisc_pkt_len_init(struct sk_buff *skb)
2673 {
2674         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2675
2676         qdisc_skb_cb(skb)->pkt_len = skb->len;
2677
2678         /* To get more precise estimation of bytes sent on wire,
2679          * we add to pkt_len the headers size of all segments
2680          */
2681         if (shinfo->gso_size)  {
2682                 unsigned int hdr_len;
2683                 u16 gso_segs = shinfo->gso_segs;
2684
2685                 /* mac layer + network layer */
2686                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2687
2688                 /* + transport layer */
2689                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2690                         hdr_len += tcp_hdrlen(skb);
2691                 else
2692                         hdr_len += sizeof(struct udphdr);
2693
2694                 if (shinfo->gso_type & SKB_GSO_DODGY)
2695                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2696                                                 shinfo->gso_size);
2697
2698                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2699         }
2700 }
2701
2702 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2703                                  struct net_device *dev,
2704                                  struct netdev_queue *txq)
2705 {
2706         spinlock_t *root_lock = qdisc_lock(q);
2707         bool contended;
2708         int rc;
2709
2710         qdisc_pkt_len_init(skb);
2711         qdisc_calculate_pkt_len(skb, q);
2712         /*
2713          * Heuristic to force contended enqueues to serialize on a
2714          * separate lock before trying to get qdisc main lock.
2715          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2716          * and dequeue packets faster.
2717          */
2718         contended = qdisc_is_running(q);
2719         if (unlikely(contended))
2720                 spin_lock(&q->busylock);
2721
2722         spin_lock(root_lock);
2723         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2724                 kfree_skb(skb);
2725                 rc = NET_XMIT_DROP;
2726         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2727                    qdisc_run_begin(q)) {
2728                 /*
2729                  * This is a work-conserving queue; there are no old skbs
2730                  * waiting to be sent out; and the qdisc is not running -
2731                  * xmit the skb directly.
2732                  */
2733                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2734                         skb_dst_force(skb);
2735
2736                 qdisc_bstats_update(q, skb);
2737
2738                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2739                         if (unlikely(contended)) {
2740                                 spin_unlock(&q->busylock);
2741                                 contended = false;
2742                         }
2743                         __qdisc_run(q);
2744                 } else
2745                         qdisc_run_end(q);
2746
2747                 rc = NET_XMIT_SUCCESS;
2748         } else {
2749                 skb_dst_force(skb);
2750                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2751                 if (qdisc_run_begin(q)) {
2752                         if (unlikely(contended)) {
2753                                 spin_unlock(&q->busylock);
2754                                 contended = false;
2755                         }
2756                         __qdisc_run(q);
2757                 }
2758         }
2759         spin_unlock(root_lock);
2760         if (unlikely(contended))
2761                 spin_unlock(&q->busylock);
2762         return rc;
2763 }
2764
2765 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2766 static void skb_update_prio(struct sk_buff *skb)
2767 {
2768         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2769
2770         if (!skb->priority && skb->sk && map) {
2771                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2772
2773                 if (prioidx < map->priomap_len)
2774                         skb->priority = map->priomap[prioidx];
2775         }
2776 }
2777 #else
2778 #define skb_update_prio(skb)
2779 #endif
2780
2781 static DEFINE_PER_CPU(int, xmit_recursion);
2782 #define RECURSION_LIMIT 10
2783
2784 /**
2785  *      dev_loopback_xmit - loop back @skb
2786  *      @skb: buffer to transmit
2787  */
2788 int dev_loopback_xmit(struct sk_buff *skb)
2789 {
2790         skb_reset_mac_header(skb);
2791         __skb_pull(skb, skb_network_offset(skb));
2792         skb->pkt_type = PACKET_LOOPBACK;
2793         skb->ip_summed = CHECKSUM_UNNECESSARY;
2794         WARN_ON(!skb_dst(skb));
2795         skb_dst_force(skb);
2796         netif_rx_ni(skb);
2797         return 0;
2798 }
2799 EXPORT_SYMBOL(dev_loopback_xmit);
2800
2801 /**
2802  *      __dev_queue_xmit - transmit a buffer
2803  *      @skb: buffer to transmit
2804  *      @accel_priv: private data used for L2 forwarding offload
2805  *
2806  *      Queue a buffer for transmission to a network device. The caller must
2807  *      have set the device and priority and built the buffer before calling
2808  *      this function. The function can be called from an interrupt.
2809  *
2810  *      A negative errno code is returned on a failure. A success does not
2811  *      guarantee the frame will be transmitted as it may be dropped due
2812  *      to congestion or traffic shaping.
2813  *
2814  * -----------------------------------------------------------------------------------
2815  *      I notice this method can also return errors from the queue disciplines,
2816  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2817  *      be positive.
2818  *
2819  *      Regardless of the return value, the skb is consumed, so it is currently
2820  *      difficult to retry a send to this method.  (You can bump the ref count
2821  *      before sending to hold a reference for retry if you are careful.)
2822  *
2823  *      When calling this method, interrupts MUST be enabled.  This is because
2824  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2825  *          --BLG
2826  */
2827 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2828 {
2829         struct net_device *dev = skb->dev;
2830         struct netdev_queue *txq;
2831         struct Qdisc *q;
2832         int rc = -ENOMEM;
2833
2834         skb_reset_mac_header(skb);
2835
2836         /* Disable soft irqs for various locks below. Also
2837          * stops preemption for RCU.
2838          */
2839         rcu_read_lock_bh();
2840
2841         skb_update_prio(skb);
2842
2843         txq = netdev_pick_tx(dev, skb, accel_priv);
2844         q = rcu_dereference_bh(txq->qdisc);
2845
2846 #ifdef CONFIG_NET_CLS_ACT
2847         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2848 #endif
2849         trace_net_dev_queue(skb);
2850         if (q->enqueue) {
2851                 rc = __dev_xmit_skb(skb, q, dev, txq);
2852                 goto out;
2853         }
2854
2855         /* The device has no queue. Common case for software devices:
2856            loopback, all the sorts of tunnels...
2857
2858            Really, it is unlikely that netif_tx_lock protection is necessary
2859            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2860            counters.)
2861            However, it is possible, that they rely on protection
2862            made by us here.
2863
2864            Check this and shot the lock. It is not prone from deadlocks.
2865            Either shot noqueue qdisc, it is even simpler 8)
2866          */
2867         if (dev->flags & IFF_UP) {
2868                 int cpu = smp_processor_id(); /* ok because BHs are off */
2869
2870                 if (txq->xmit_lock_owner != cpu) {
2871
2872                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2873                                 goto recursion_alert;
2874
2875                         HARD_TX_LOCK(dev, txq, cpu);
2876
2877                         if (!netif_xmit_stopped(txq)) {
2878                                 __this_cpu_inc(xmit_recursion);
2879                                 rc = dev_hard_start_xmit(skb, dev, txq);
2880                                 __this_cpu_dec(xmit_recursion);
2881                                 if (dev_xmit_complete(rc)) {
2882                                         HARD_TX_UNLOCK(dev, txq);
2883                                         goto out;
2884                                 }
2885                         }
2886                         HARD_TX_UNLOCK(dev, txq);
2887                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2888                                              dev->name);
2889                 } else {
2890                         /* Recursion is detected! It is possible,
2891                          * unfortunately
2892                          */
2893 recursion_alert:
2894                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2895                                              dev->name);
2896                 }
2897         }
2898
2899         rc = -ENETDOWN;
2900         rcu_read_unlock_bh();
2901
2902         atomic_long_inc(&dev->tx_dropped);
2903         kfree_skb(skb);
2904         return rc;
2905 out:
2906         rcu_read_unlock_bh();
2907         return rc;
2908 }
2909
2910 int dev_queue_xmit(struct sk_buff *skb)
2911 {
2912         return __dev_queue_xmit(skb, NULL);
2913 }
2914 EXPORT_SYMBOL(dev_queue_xmit);
2915
2916 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2917 {
2918         return __dev_queue_xmit(skb, accel_priv);
2919 }
2920 EXPORT_SYMBOL(dev_queue_xmit_accel);
2921
2922
2923 /*=======================================================================
2924                         Receiver routines
2925   =======================================================================*/
2926
2927 int netdev_max_backlog __read_mostly = 1000;
2928 EXPORT_SYMBOL(netdev_max_backlog);
2929
2930 int netdev_tstamp_prequeue __read_mostly = 1;
2931 int netdev_budget __read_mostly = 300;
2932 int weight_p __read_mostly = 64;            /* old backlog weight */
2933
2934 /* Called with irq disabled */
2935 static inline void ____napi_schedule(struct softnet_data *sd,
2936                                      struct napi_struct *napi)
2937 {
2938         list_add_tail(&napi->poll_list, &sd->poll_list);
2939         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2940 }
2941
2942 #ifdef CONFIG_RPS
2943
2944 /* One global table that all flow-based protocols share. */
2945 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2946 EXPORT_SYMBOL(rps_sock_flow_table);
2947
2948 struct static_key rps_needed __read_mostly;
2949
2950 static struct rps_dev_flow *
2951 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2952             struct rps_dev_flow *rflow, u16 next_cpu)
2953 {
2954         if (next_cpu != RPS_NO_CPU) {
2955 #ifdef CONFIG_RFS_ACCEL
2956                 struct netdev_rx_queue *rxqueue;
2957                 struct rps_dev_flow_table *flow_table;
2958                 struct rps_dev_flow *old_rflow;
2959                 u32 flow_id;
2960                 u16 rxq_index;
2961                 int rc;
2962
2963                 /* Should we steer this flow to a different hardware queue? */
2964                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2965                     !(dev->features & NETIF_F_NTUPLE))
2966                         goto out;
2967                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2968                 if (rxq_index == skb_get_rx_queue(skb))
2969                         goto out;
2970
2971                 rxqueue = dev->_rx + rxq_index;
2972                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2973                 if (!flow_table)
2974                         goto out;
2975                 flow_id = skb_get_hash(skb) & flow_table->mask;
2976                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2977                                                         rxq_index, flow_id);
2978                 if (rc < 0)
2979                         goto out;
2980                 old_rflow = rflow;
2981                 rflow = &flow_table->flows[flow_id];
2982                 rflow->filter = rc;
2983                 if (old_rflow->filter == rflow->filter)
2984                         old_rflow->filter = RPS_NO_FILTER;
2985         out:
2986 #endif
2987                 rflow->last_qtail =
2988                         per_cpu(softnet_data, next_cpu).input_queue_head;
2989         }
2990
2991         rflow->cpu = next_cpu;
2992         return rflow;
2993 }
2994
2995 /*
2996  * get_rps_cpu is called from netif_receive_skb and returns the target
2997  * CPU from the RPS map of the receiving queue for a given skb.
2998  * rcu_read_lock must be held on entry.
2999  */
3000 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3001                        struct rps_dev_flow **rflowp)
3002 {
3003         struct netdev_rx_queue *rxqueue;
3004         struct rps_map *map;
3005         struct rps_dev_flow_table *flow_table;
3006         struct rps_sock_flow_table *sock_flow_table;
3007         int cpu = -1;
3008         u16 tcpu;
3009         u32 hash;
3010
3011         if (skb_rx_queue_recorded(skb)) {
3012                 u16 index = skb_get_rx_queue(skb);
3013                 if (unlikely(index >= dev->real_num_rx_queues)) {
3014                         WARN_ONCE(dev->real_num_rx_queues > 1,
3015                                   "%s received packet on queue %u, but number "
3016                                   "of RX queues is %u\n",
3017                                   dev->name, index, dev->real_num_rx_queues);
3018                         goto done;
3019                 }
3020                 rxqueue = dev->_rx + index;
3021         } else
3022                 rxqueue = dev->_rx;
3023
3024         map = rcu_dereference(rxqueue->rps_map);
3025         if (map) {
3026                 if (map->len == 1 &&
3027                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3028                         tcpu = map->cpus[0];
3029                         if (cpu_online(tcpu))
3030                                 cpu = tcpu;
3031                         goto done;
3032                 }
3033         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3034                 goto done;
3035         }
3036
3037         skb_reset_network_header(skb);
3038         hash = skb_get_hash(skb);
3039         if (!hash)
3040                 goto done;
3041
3042         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3043         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3044         if (flow_table && sock_flow_table) {
3045                 u16 next_cpu;
3046                 struct rps_dev_flow *rflow;
3047
3048                 rflow = &flow_table->flows[hash & flow_table->mask];
3049                 tcpu = rflow->cpu;
3050
3051                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3052
3053                 /*
3054                  * If the desired CPU (where last recvmsg was done) is
3055                  * different from current CPU (one in the rx-queue flow
3056                  * table entry), switch if one of the following holds:
3057                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3058                  *   - Current CPU is offline.
3059                  *   - The current CPU's queue tail has advanced beyond the
3060                  *     last packet that was enqueued using this table entry.
3061                  *     This guarantees that all previous packets for the flow
3062                  *     have been dequeued, thus preserving in order delivery.
3063                  */
3064                 if (unlikely(tcpu != next_cpu) &&
3065                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3066                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3067                       rflow->last_qtail)) >= 0)) {
3068                         tcpu = next_cpu;
3069                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3070                 }
3071
3072                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3073                         *rflowp = rflow;
3074                         cpu = tcpu;
3075                         goto done;
3076                 }
3077         }
3078
3079         if (map) {
3080                 tcpu = map->cpus[((u64) hash * map->len) >> 32];
3081
3082                 if (cpu_online(tcpu)) {
3083                         cpu = tcpu;
3084                         goto done;
3085                 }
3086         }
3087
3088 done:
3089         return cpu;
3090 }
3091
3092 #ifdef CONFIG_RFS_ACCEL
3093
3094 /**
3095  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3096  * @dev: Device on which the filter was set
3097  * @rxq_index: RX queue index
3098  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3099  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3100  *
3101  * Drivers that implement ndo_rx_flow_steer() should periodically call
3102  * this function for each installed filter and remove the filters for
3103  * which it returns %true.
3104  */
3105 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3106                          u32 flow_id, u16 filter_id)
3107 {
3108         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3109         struct rps_dev_flow_table *flow_table;
3110         struct rps_dev_flow *rflow;
3111         bool expire = true;
3112         int cpu;
3113
3114         rcu_read_lock();
3115         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3116         if (flow_table && flow_id <= flow_table->mask) {
3117                 rflow = &flow_table->flows[flow_id];
3118                 cpu = ACCESS_ONCE(rflow->cpu);
3119                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3120                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3121                            rflow->last_qtail) <
3122                      (int)(10 * flow_table->mask)))
3123                         expire = false;
3124         }
3125         rcu_read_unlock();
3126         return expire;
3127 }
3128 EXPORT_SYMBOL(rps_may_expire_flow);
3129
3130 #endif /* CONFIG_RFS_ACCEL */
3131
3132 /* Called from hardirq (IPI) context */
3133 static void rps_trigger_softirq(void *data)
3134 {
3135         struct softnet_data *sd = data;
3136
3137         ____napi_schedule(sd, &sd->backlog);
3138         sd->received_rps++;
3139 }
3140
3141 #endif /* CONFIG_RPS */
3142
3143 /*
3144  * Check if this softnet_data structure is another cpu one
3145  * If yes, queue it to our IPI list and return 1
3146  * If no, return 0
3147  */
3148 static int rps_ipi_queued(struct softnet_data *sd)
3149 {
3150 #ifdef CONFIG_RPS
3151         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3152
3153         if (sd != mysd) {
3154                 sd->rps_ipi_next = mysd->rps_ipi_list;
3155                 mysd->rps_ipi_list = sd;
3156
3157                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3158                 return 1;
3159         }
3160 #endif /* CONFIG_RPS */
3161         return 0;
3162 }
3163
3164 #ifdef CONFIG_NET_FLOW_LIMIT
3165 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3166 #endif
3167
3168 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3169 {
3170 #ifdef CONFIG_NET_FLOW_LIMIT
3171         struct sd_flow_limit *fl;
3172         struct softnet_data *sd;
3173         unsigned int old_flow, new_flow;
3174
3175         if (qlen < (netdev_max_backlog >> 1))
3176                 return false;
3177
3178         sd = &__get_cpu_var(softnet_data);
3179
3180         rcu_read_lock();
3181         fl = rcu_dereference(sd->flow_limit);
3182         if (fl) {
3183                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3184                 old_flow = fl->history[fl->history_head];
3185                 fl->history[fl->history_head] = new_flow;
3186
3187                 fl->history_head++;
3188                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3189
3190                 if (likely(fl->buckets[old_flow]))
3191                         fl->buckets[old_flow]--;
3192
3193                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3194                         fl->count++;
3195                         rcu_read_unlock();
3196                         return true;
3197                 }
3198         }
3199         rcu_read_unlock();
3200 #endif
3201         return false;
3202 }
3203
3204 /*
3205  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3206  * queue (may be a remote CPU queue).
3207  */
3208 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3209                               unsigned int *qtail)
3210 {
3211         struct softnet_data *sd;
3212         unsigned long flags;
3213         unsigned int qlen;
3214
3215         sd = &per_cpu(softnet_data, cpu);
3216
3217         local_irq_save(flags);
3218
3219         rps_lock(sd);
3220         qlen = skb_queue_len(&sd->input_pkt_queue);
3221         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3222                 if (skb_queue_len(&sd->input_pkt_queue)) {
3223 enqueue:
3224                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3225                         input_queue_tail_incr_save(sd, qtail);
3226                         rps_unlock(sd);
3227                         local_irq_restore(flags);
3228                         return NET_RX_SUCCESS;
3229                 }
3230
3231                 /* Schedule NAPI for backlog device
3232                  * We can use non atomic operation since we own the queue lock
3233                  */
3234                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3235                         if (!rps_ipi_queued(sd))
3236                                 ____napi_schedule(sd, &sd->backlog);
3237                 }
3238                 goto enqueue;
3239         }
3240
3241         sd->dropped++;
3242         rps_unlock(sd);
3243
3244         local_irq_restore(flags);
3245
3246         atomic_long_inc(&skb->dev->rx_dropped);
3247         kfree_skb(skb);
3248         return NET_RX_DROP;
3249 }
3250
3251 static int netif_rx_internal(struct sk_buff *skb)
3252 {
3253         int ret;
3254
3255         net_timestamp_check(netdev_tstamp_prequeue, skb);
3256
3257         trace_netif_rx(skb);
3258 #ifdef CONFIG_RPS
3259         if (static_key_false(&rps_needed)) {
3260                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3261                 int cpu;
3262
3263                 preempt_disable();
3264                 rcu_read_lock();
3265
3266                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3267                 if (cpu < 0)
3268                         cpu = smp_processor_id();
3269
3270                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3271
3272                 rcu_read_unlock();
3273                 preempt_enable();
3274         } else
3275 #endif
3276         {
3277                 unsigned int qtail;
3278                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3279                 put_cpu();
3280         }
3281         return ret;
3282 }
3283
3284 /**
3285  *      netif_rx        -       post buffer to the network code
3286  *      @skb: buffer to post
3287  *
3288  *      This function receives a packet from a device driver and queues it for
3289  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3290  *      may be dropped during processing for congestion control or by the
3291  *      protocol layers.
3292  *
3293  *      return values:
3294  *      NET_RX_SUCCESS  (no congestion)
3295  *      NET_RX_DROP     (packet was dropped)
3296  *
3297  */
3298
3299 int netif_rx(struct sk_buff *skb)
3300 {
3301         trace_netif_rx_entry(skb);
3302
3303         return netif_rx_internal(skb);
3304 }
3305 EXPORT_SYMBOL(netif_rx);
3306
3307 int netif_rx_ni(struct sk_buff *skb)
3308 {
3309         int err;
3310
3311         trace_netif_rx_ni_entry(skb);
3312
3313         preempt_disable();
3314         err = netif_rx_internal(skb);
3315         if (local_softirq_pending())
3316                 do_softirq();
3317         preempt_enable();
3318
3319         return err;
3320 }
3321 EXPORT_SYMBOL(netif_rx_ni);
3322
3323 static void net_tx_action(struct softirq_action *h)
3324 {
3325         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3326
3327         if (sd->completion_queue) {
3328                 struct sk_buff *clist;
3329
3330                 local_irq_disable();
3331                 clist = sd->completion_queue;
3332                 sd->completion_queue = NULL;
3333                 local_irq_enable();
3334
3335                 while (clist) {
3336                         struct sk_buff *skb = clist;
3337                         clist = clist->next;
3338
3339                         WARN_ON(atomic_read(&skb->users));
3340                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3341                                 trace_consume_skb(skb);
3342                         else
3343                                 trace_kfree_skb(skb, net_tx_action);
3344                         __kfree_skb(skb);
3345                 }
3346         }
3347
3348         if (sd->output_queue) {
3349                 struct Qdisc *head;
3350
3351                 local_irq_disable();
3352                 head = sd->output_queue;
3353                 sd->output_queue = NULL;
3354                 sd->output_queue_tailp = &sd->output_queue;
3355                 local_irq_enable();
3356
3357                 while (head) {
3358                         struct Qdisc *q = head;
3359                         spinlock_t *root_lock;
3360
3361                         head = head->next_sched;
3362
3363                         root_lock = qdisc_lock(q);
3364                         if (spin_trylock(root_lock)) {
3365                                 smp_mb__before_clear_bit();
3366                                 clear_bit(__QDISC_STATE_SCHED,
3367                                           &q->state);
3368                                 qdisc_run(q);
3369                                 spin_unlock(root_lock);
3370                         } else {
3371                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3372                                               &q->state)) {
3373                                         __netif_reschedule(q);
3374                                 } else {
3375                                         smp_mb__before_clear_bit();
3376                                         clear_bit(__QDISC_STATE_SCHED,
3377                                                   &q->state);
3378                                 }
3379                         }
3380                 }
3381         }
3382 }
3383
3384 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3385     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3386 /* This hook is defined here for ATM LANE */
3387 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3388                              unsigned char *addr) __read_mostly;
3389 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3390 #endif
3391
3392 #ifdef CONFIG_NET_CLS_ACT
3393 /* TODO: Maybe we should just force sch_ingress to be compiled in
3394  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3395  * a compare and 2 stores extra right now if we dont have it on
3396  * but have CONFIG_NET_CLS_ACT
3397  * NOTE: This doesn't stop any functionality; if you dont have
3398  * the ingress scheduler, you just can't add policies on ingress.
3399  *
3400  */
3401 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3402 {
3403         struct net_device *dev = skb->dev;
3404         u32 ttl = G_TC_RTTL(skb->tc_verd);
3405         int result = TC_ACT_OK;
3406         struct Qdisc *q;
3407
3408         if (unlikely(MAX_RED_LOOP < ttl++)) {
3409                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3410                                      skb->skb_iif, dev->ifindex);
3411                 return TC_ACT_SHOT;
3412         }
3413
3414         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3415         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3416
3417         q = rxq->qdisc;
3418         if (q != &noop_qdisc) {
3419                 spin_lock(qdisc_lock(q));
3420                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3421                         result = qdisc_enqueue_root(skb, q);
3422                 spin_unlock(qdisc_lock(q));
3423         }
3424
3425         return result;
3426 }
3427
3428 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3429                                          struct packet_type **pt_prev,
3430                                          int *ret, struct net_device *orig_dev)
3431 {
3432         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3433
3434         if (!rxq || rxq->qdisc == &noop_qdisc)
3435                 goto out;
3436
3437         if (*pt_prev) {
3438                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3439                 *pt_prev = NULL;
3440         }
3441
3442         switch (ing_filter(skb, rxq)) {
3443         case TC_ACT_SHOT:
3444         case TC_ACT_STOLEN:
3445                 kfree_skb(skb);
3446                 return NULL;
3447         }
3448
3449 out:
3450         skb->tc_verd = 0;
3451         return skb;
3452 }
3453 #endif
3454
3455 /**
3456  *      netdev_rx_handler_register - register receive handler
3457  *      @dev: device to register a handler for
3458  *      @rx_handler: receive handler to register
3459  *      @rx_handler_data: data pointer that is used by rx handler
3460  *
3461  *      Register a receive handler for a device. This handler will then be
3462  *      called from __netif_receive_skb. A negative errno code is returned
3463  *      on a failure.
3464  *
3465  *      The caller must hold the rtnl_mutex.
3466  *
3467  *      For a general description of rx_handler, see enum rx_handler_result.
3468  */
3469 int netdev_rx_handler_register(struct net_device *dev,
3470                                rx_handler_func_t *rx_handler,
3471                                void *rx_handler_data)
3472 {
3473         ASSERT_RTNL();
3474
3475         if (dev->rx_handler)
3476                 return -EBUSY;
3477
3478         /* Note: rx_handler_data must be set before rx_handler */
3479         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3480         rcu_assign_pointer(dev->rx_handler, rx_handler);
3481
3482         return 0;
3483 }
3484 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3485
3486 /**
3487  *      netdev_rx_handler_unregister - unregister receive handler
3488  *      @dev: device to unregister a handler from
3489  *
3490  *      Unregister a receive handler from a device.
3491  *
3492  *      The caller must hold the rtnl_mutex.
3493  */
3494 void netdev_rx_handler_unregister(struct net_device *dev)
3495 {
3496
3497         ASSERT_RTNL();
3498         RCU_INIT_POINTER(dev->rx_handler, NULL);
3499         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3500          * section has a guarantee to see a non NULL rx_handler_data
3501          * as well.
3502          */
3503         synchronize_net();
3504         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3505 }
3506 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3507
3508 /*
3509  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3510  * the special handling of PFMEMALLOC skbs.
3511  */
3512 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3513 {
3514         switch (skb->protocol) {
3515         case htons(ETH_P_ARP):
3516         case htons(ETH_P_IP):
3517         case htons(ETH_P_IPV6):
3518         case htons(ETH_P_8021Q):
3519         case htons(ETH_P_8021AD):
3520                 return true;
3521         default:
3522                 return false;
3523         }
3524 }
3525
3526 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3527 {
3528         struct packet_type *ptype, *pt_prev;
3529         rx_handler_func_t *rx_handler;
3530         struct net_device *orig_dev;
3531         struct net_device *null_or_dev;
3532         bool deliver_exact = false;
3533         int ret = NET_RX_DROP;
3534         __be16 type;
3535
3536         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3537
3538         trace_netif_receive_skb(skb);
3539
3540         orig_dev = skb->dev;
3541
3542         skb_reset_network_header(skb);
3543         if (!skb_transport_header_was_set(skb))
3544                 skb_reset_transport_header(skb);
3545         skb_reset_mac_len(skb);
3546
3547         pt_prev = NULL;
3548
3549         rcu_read_lock();
3550
3551 another_round:
3552         skb->skb_iif = skb->dev->ifindex;
3553
3554         __this_cpu_inc(softnet_data.processed);
3555
3556         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3557             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3558                 skb = vlan_untag(skb);
3559                 if (unlikely(!skb))
3560                         goto unlock;
3561         }
3562
3563 #ifdef CONFIG_NET_CLS_ACT
3564         if (skb->tc_verd & TC_NCLS) {
3565                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3566                 goto ncls;
3567         }
3568 #endif
3569
3570         if (pfmemalloc)
3571                 goto skip_taps;
3572
3573         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3574                 if (!ptype->dev || ptype->dev == skb->dev) {
3575                         if (pt_prev)
3576                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3577                         pt_prev = ptype;
3578                 }
3579         }
3580
3581 skip_taps:
3582 #ifdef CONFIG_NET_CLS_ACT
3583         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3584         if (!skb)
3585                 goto unlock;
3586 ncls:
3587 #endif
3588
3589         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3590                 goto drop;
3591
3592         if (vlan_tx_tag_present(skb)) {
3593                 if (pt_prev) {
3594                         ret = deliver_skb(skb, pt_prev, orig_dev);
3595                         pt_prev = NULL;
3596                 }
3597                 if (vlan_do_receive(&skb))
3598                         goto another_round;
3599                 else if (unlikely(!skb))
3600                         goto unlock;
3601         }
3602
3603         rx_handler = rcu_dereference(skb->dev->rx_handler);
3604         if (rx_handler) {
3605                 if (pt_prev) {
3606                         ret = deliver_skb(skb, pt_prev, orig_dev);
3607                         pt_prev = NULL;
3608                 }
3609                 switch (rx_handler(&skb)) {
3610                 case RX_HANDLER_CONSUMED:
3611                         ret = NET_RX_SUCCESS;
3612                         goto unlock;
3613                 case RX_HANDLER_ANOTHER:
3614                         goto another_round;
3615                 case RX_HANDLER_EXACT:
3616                         deliver_exact = true;
3617                 case RX_HANDLER_PASS:
3618                         break;
3619                 default:
3620                         BUG();
3621                 }
3622         }
3623
3624         if (unlikely(vlan_tx_tag_present(skb))) {
3625                 if (vlan_tx_tag_get_id(skb))
3626                         skb->pkt_type = PACKET_OTHERHOST;
3627                 /* Note: we might in the future use prio bits
3628                  * and set skb->priority like in vlan_do_receive()
3629                  * For the time being, just ignore Priority Code Point
3630                  */
3631                 skb->vlan_tci = 0;
3632         }
3633
3634         /* deliver only exact match when indicated */
3635         null_or_dev = deliver_exact ? skb->dev : NULL;
3636
3637         type = skb->protocol;
3638         list_for_each_entry_rcu(ptype,
3639                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3640                 if (ptype->type == type &&
3641                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3642                      ptype->dev == orig_dev)) {
3643                         if (pt_prev)
3644                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3645                         pt_prev = ptype;
3646                 }
3647         }
3648
3649         if (pt_prev) {
3650                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3651                         goto drop;
3652                 else
3653                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3654         } else {
3655 drop:
3656                 atomic_long_inc(&skb->dev->rx_dropped);
3657                 kfree_skb(skb);
3658                 /* Jamal, now you will not able to escape explaining
3659                  * me how you were going to use this. :-)
3660                  */
3661                 ret = NET_RX_DROP;
3662         }
3663
3664 unlock:
3665         rcu_read_unlock();
3666         return ret;
3667 }
3668
3669 static int __netif_receive_skb(struct sk_buff *skb)
3670 {
3671         int ret;
3672
3673         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3674                 unsigned long pflags = current->flags;
3675
3676                 /*
3677                  * PFMEMALLOC skbs are special, they should
3678                  * - be delivered to SOCK_MEMALLOC sockets only
3679                  * - stay away from userspace
3680                  * - have bounded memory usage
3681                  *
3682                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3683                  * context down to all allocation sites.
3684                  */
3685                 current->flags |= PF_MEMALLOC;
3686                 ret = __netif_receive_skb_core(skb, true);
3687                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3688         } else
3689                 ret = __netif_receive_skb_core(skb, false);
3690
3691         return ret;
3692 }
3693
3694 static int netif_receive_skb_internal(struct sk_buff *skb)
3695 {
3696         net_timestamp_check(netdev_tstamp_prequeue, skb);
3697
3698         if (skb_defer_rx_timestamp(skb))
3699                 return NET_RX_SUCCESS;
3700
3701 #ifdef CONFIG_RPS
3702         if (static_key_false(&rps_needed)) {
3703                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3704                 int cpu, ret;
3705
3706                 rcu_read_lock();
3707
3708                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3709
3710                 if (cpu >= 0) {
3711                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3712                         rcu_read_unlock();
3713                         return ret;
3714                 }
3715                 rcu_read_unlock();
3716         }
3717 #endif
3718         return __netif_receive_skb(skb);
3719 }
3720
3721 /**
3722  *      netif_receive_skb - process receive buffer from network
3723  *      @skb: buffer to process
3724  *
3725  *      netif_receive_skb() is the main receive data processing function.
3726  *      It always succeeds. The buffer may be dropped during processing
3727  *      for congestion control or by the protocol layers.
3728  *
3729  *      This function may only be called from softirq context and interrupts
3730  *      should be enabled.
3731  *
3732  *      Return values (usually ignored):
3733  *      NET_RX_SUCCESS: no congestion
3734  *      NET_RX_DROP: packet was dropped
3735  */
3736 int netif_receive_skb(struct sk_buff *skb)
3737 {
3738         trace_netif_receive_skb_entry(skb);
3739
3740         return netif_receive_skb_internal(skb);
3741 }
3742 EXPORT_SYMBOL(netif_receive_skb);
3743
3744 /* Network device is going away, flush any packets still pending
3745  * Called with irqs disabled.
3746  */
3747 static void flush_backlog(void *arg)
3748 {
3749         struct net_device *dev = arg;
3750         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3751         struct sk_buff *skb, *tmp;
3752
3753         rps_lock(sd);
3754         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3755                 if (skb->dev == dev) {
3756                         __skb_unlink(skb, &sd->input_pkt_queue);
3757                         kfree_skb(skb);
3758                         input_queue_head_incr(sd);
3759                 }
3760         }
3761         rps_unlock(sd);
3762
3763         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3764                 if (skb->dev == dev) {
3765                         __skb_unlink(skb, &sd->process_queue);
3766                         kfree_skb(skb);
3767                         input_queue_head_incr(sd);
3768                 }
3769         }
3770 }
3771
3772 static int napi_gro_complete(struct sk_buff *skb)
3773 {
3774         struct packet_offload *ptype;
3775         __be16 type = skb->protocol;
3776         struct list_head *head = &offload_base;
3777         int err = -ENOENT;
3778
3779         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3780
3781         if (NAPI_GRO_CB(skb)->count == 1) {
3782                 skb_shinfo(skb)->gso_size = 0;
3783                 goto out;
3784         }
3785
3786         rcu_read_lock();
3787         list_for_each_entry_rcu(ptype, head, list) {
3788                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3789                         continue;
3790
3791                 err = ptype->callbacks.gro_complete(skb, 0);
3792                 break;
3793         }
3794         rcu_read_unlock();
3795
3796         if (err) {
3797                 WARN_ON(&ptype->list == head);
3798                 kfree_skb(skb);
3799                 return NET_RX_SUCCESS;
3800         }
3801
3802 out:
3803         return netif_receive_skb_internal(skb);
3804 }
3805
3806 /* napi->gro_list contains packets ordered by age.
3807  * youngest packets at the head of it.
3808  * Complete skbs in reverse order to reduce latencies.
3809  */
3810 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3811 {
3812         struct sk_buff *skb, *prev = NULL;
3813
3814         /* scan list and build reverse chain */
3815         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3816                 skb->prev = prev;
3817                 prev = skb;
3818         }
3819
3820         for (skb = prev; skb; skb = prev) {
3821                 skb->next = NULL;
3822
3823                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3824                         return;
3825
3826                 prev = skb->prev;
3827                 napi_gro_complete(skb);
3828                 napi->gro_count--;
3829         }
3830
3831         napi->gro_list = NULL;
3832 }
3833 EXPORT_SYMBOL(napi_gro_flush);
3834
3835 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3836 {
3837         struct sk_buff *p;
3838         unsigned int maclen = skb->dev->hard_header_len;
3839         u32 hash = skb_get_hash_raw(skb);
3840
3841         for (p = napi->gro_list; p; p = p->next) {
3842                 unsigned long diffs;
3843
3844                 NAPI_GRO_CB(p)->flush = 0;
3845
3846                 if (hash != skb_get_hash_raw(p)) {
3847                         NAPI_GRO_CB(p)->same_flow = 0;
3848                         continue;
3849                 }
3850
3851                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3852                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3853                 if (maclen == ETH_HLEN)
3854                         diffs |= compare_ether_header(skb_mac_header(p),
3855                                                       skb_mac_header(skb));
3856                 else if (!diffs)
3857                         diffs = memcmp(skb_mac_header(p),
3858                                        skb_mac_header(skb),
3859                                        maclen);
3860                 NAPI_GRO_CB(p)->same_flow = !diffs;
3861         }
3862 }
3863
3864 static void skb_gro_reset_offset(struct sk_buff *skb)
3865 {
3866         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3867         const skb_frag_t *frag0 = &pinfo->frags[0];
3868
3869         NAPI_GRO_CB(skb)->data_offset = 0;
3870         NAPI_GRO_CB(skb)->frag0 = NULL;
3871         NAPI_GRO_CB(skb)->frag0_len = 0;
3872
3873         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3874             pinfo->nr_frags &&
3875             !PageHighMem(skb_frag_page(frag0))) {
3876                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3877                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3878         }
3879 }
3880
3881 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3882 {
3883         struct skb_shared_info *pinfo = skb_shinfo(skb);
3884
3885         BUG_ON(skb->end - skb->tail < grow);
3886
3887         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3888
3889         skb->data_len -= grow;
3890         skb->tail += grow;
3891
3892         pinfo->frags[0].page_offset += grow;
3893         skb_frag_size_sub(&pinfo->frags[0], grow);
3894
3895         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3896                 skb_frag_unref(skb, 0);
3897                 memmove(pinfo->frags, pinfo->frags + 1,
3898                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3899         }
3900 }
3901
3902 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3903 {
3904         struct sk_buff **pp = NULL;
3905         struct packet_offload *ptype;
3906         __be16 type = skb->protocol;
3907         struct list_head *head = &offload_base;
3908         int same_flow;
3909         enum gro_result ret;
3910         int grow;
3911
3912         if (!(skb->dev->features & NETIF_F_GRO))
3913                 goto normal;
3914
3915         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3916                 goto normal;
3917
3918         gro_list_prepare(napi, skb);
3919         NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
3920
3921         rcu_read_lock();
3922         list_for_each_entry_rcu(ptype, head, list) {
3923                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3924                         continue;
3925
3926                 skb_set_network_header(skb, skb_gro_offset(skb));
3927                 skb_reset_mac_len(skb);
3928                 NAPI_GRO_CB(skb)->same_flow = 0;
3929                 NAPI_GRO_CB(skb)->flush = 0;
3930                 NAPI_GRO_CB(skb)->free = 0;
3931                 NAPI_GRO_CB(skb)->udp_mark = 0;
3932
3933                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3934                 break;
3935         }
3936         rcu_read_unlock();
3937
3938         if (&ptype->list == head)
3939                 goto normal;
3940
3941         same_flow = NAPI_GRO_CB(skb)->same_flow;
3942         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3943
3944         if (pp) {
3945                 struct sk_buff *nskb = *pp;
3946
3947                 *pp = nskb->next;
3948                 nskb->next = NULL;
3949                 napi_gro_complete(nskb);
3950                 napi->gro_count--;
3951         }
3952
3953         if (same_flow)
3954                 goto ok;
3955
3956         if (NAPI_GRO_CB(skb)->flush)
3957                 goto normal;
3958
3959         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
3960                 struct sk_buff *nskb = napi->gro_list;
3961
3962                 /* locate the end of the list to select the 'oldest' flow */
3963                 while (nskb->next) {
3964                         pp = &nskb->next;
3965                         nskb = *pp;
3966                 }
3967                 *pp = NULL;
3968                 nskb->next = NULL;
3969                 napi_gro_complete(nskb);
3970         } else {
3971                 napi->gro_count++;
3972         }
3973         NAPI_GRO_CB(skb)->count = 1;
3974         NAPI_GRO_CB(skb)->age = jiffies;
3975         NAPI_GRO_CB(skb)->last = skb;
3976         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3977         skb->next = napi->gro_list;
3978         napi->gro_list = skb;
3979         ret = GRO_HELD;
3980
3981 pull:
3982         grow = skb_gro_offset(skb) - skb_headlen(skb);
3983         if (grow > 0)
3984                 gro_pull_from_frag0(skb, grow);
3985 ok:
3986         return ret;
3987
3988 normal:
3989         ret = GRO_NORMAL;
3990         goto pull;
3991 }
3992
3993 struct packet_offload *gro_find_receive_by_type(__be16 type)
3994 {
3995         struct list_head *offload_head = &offload_base;
3996         struct packet_offload *ptype;
3997
3998         list_for_each_entry_rcu(ptype, offload_head, list) {
3999                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4000                         continue;
4001                 return ptype;
4002         }
4003         return NULL;
4004 }
4005 EXPORT_SYMBOL(gro_find_receive_by_type);
4006
4007 struct packet_offload *gro_find_complete_by_type(__be16 type)
4008 {
4009         struct list_head *offload_head = &offload_base;
4010         struct packet_offload *ptype;
4011
4012         list_for_each_entry_rcu(ptype, offload_head, list) {
4013                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4014                         continue;
4015                 return ptype;
4016         }
4017         return NULL;
4018 }
4019 EXPORT_SYMBOL(gro_find_complete_by_type);
4020
4021 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4022 {
4023         switch (ret) {
4024         case GRO_NORMAL:
4025                 if (netif_receive_skb_internal(skb))
4026                         ret = GRO_DROP;
4027                 break;
4028
4029         case GRO_DROP:
4030                 kfree_skb(skb);
4031                 break;
4032
4033         case GRO_MERGED_FREE:
4034                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4035                         kmem_cache_free(skbuff_head_cache, skb);
4036                 else
4037                         __kfree_skb(skb);
4038                 break;
4039
4040         case GRO_HELD:
4041         case GRO_MERGED:
4042                 break;
4043         }
4044
4045         return ret;
4046 }
4047
4048 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4049 {
4050         trace_napi_gro_receive_entry(skb);
4051
4052         skb_gro_reset_offset(skb);
4053
4054         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4055 }
4056 EXPORT_SYMBOL(napi_gro_receive);
4057
4058 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4059 {
4060         __skb_pull(skb, skb_headlen(skb));
4061         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4062         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4063         skb->vlan_tci = 0;
4064         skb->dev = napi->dev;
4065         skb->skb_iif = 0;
4066         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4067
4068         napi->skb = skb;
4069 }
4070
4071 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4072 {
4073         struct sk_buff *skb = napi->skb;
4074
4075         if (!skb) {
4076                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4077                 napi->skb = skb;
4078         }
4079         return skb;
4080 }
4081 EXPORT_SYMBOL(napi_get_frags);
4082
4083 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4084                                       struct sk_buff *skb,
4085                                       gro_result_t ret)
4086 {
4087         switch (ret) {
4088         case GRO_NORMAL:
4089         case GRO_HELD:
4090                 __skb_push(skb, ETH_HLEN);
4091                 skb->protocol = eth_type_trans(skb, skb->dev);
4092                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4093                         ret = GRO_DROP;
4094                 break;
4095
4096         case GRO_DROP:
4097         case GRO_MERGED_FREE:
4098                 napi_reuse_skb(napi, skb);
4099                 break;
4100
4101         case GRO_MERGED:
4102                 break;
4103         }
4104
4105         return ret;
4106 }
4107
4108 /* Upper GRO stack assumes network header starts at gro_offset=0
4109  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4110  * We copy ethernet header into skb->data to have a common layout.
4111  */
4112 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4113 {
4114         struct sk_buff *skb = napi->skb;
4115         const struct ethhdr *eth;
4116         unsigned int hlen = sizeof(*eth);
4117
4118         napi->skb = NULL;
4119
4120         skb_reset_mac_header(skb);
4121         skb_gro_reset_offset(skb);
4122
4123         eth = skb_gro_header_fast(skb, 0);
4124         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4125                 eth = skb_gro_header_slow(skb, hlen, 0);
4126                 if (unlikely(!eth)) {
4127                         napi_reuse_skb(napi, skb);
4128                         return NULL;
4129                 }
4130         } else {
4131                 gro_pull_from_frag0(skb, hlen);
4132                 NAPI_GRO_CB(skb)->frag0 += hlen;
4133                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4134         }
4135         __skb_pull(skb, hlen);
4136
4137         /*
4138          * This works because the only protocols we care about don't require
4139          * special handling.
4140          * We'll fix it up properly in napi_frags_finish()
4141          */
4142         skb->protocol = eth->h_proto;
4143
4144         return skb;
4145 }
4146
4147 gro_result_t napi_gro_frags(struct napi_struct *napi)
4148 {
4149         struct sk_buff *skb = napi_frags_skb(napi);
4150
4151         if (!skb)
4152                 return GRO_DROP;
4153
4154         trace_napi_gro_frags_entry(skb);
4155
4156         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4157 }
4158 EXPORT_SYMBOL(napi_gro_frags);
4159
4160 /*
4161  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4162  * Note: called with local irq disabled, but exits with local irq enabled.
4163  */
4164 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4165 {
4166 #ifdef CONFIG_RPS
4167         struct softnet_data *remsd = sd->rps_ipi_list;
4168
4169         if (remsd) {
4170                 sd->rps_ipi_list = NULL;
4171
4172                 local_irq_enable();
4173
4174                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4175                 while (remsd) {
4176                         struct softnet_data *next = remsd->rps_ipi_next;
4177
4178                         if (cpu_online(remsd->cpu))
4179                                 smp_call_function_single_async(remsd->cpu,
4180                                                            &remsd->csd);
4181                         remsd = next;
4182                 }
4183         } else
4184 #endif
4185                 local_irq_enable();
4186 }
4187
4188 static int process_backlog(struct napi_struct *napi, int quota)
4189 {
4190         int work = 0;
4191         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4192
4193 #ifdef CONFIG_RPS
4194         /* Check if we have pending ipi, its better to send them now,
4195          * not waiting net_rx_action() end.
4196          */
4197         if (sd->rps_ipi_list) {
4198                 local_irq_disable();
4199                 net_rps_action_and_irq_enable(sd);
4200         }
4201 #endif
4202         napi->weight = weight_p;
4203         local_irq_disable();
4204         while (work < quota) {
4205                 struct sk_buff *skb;
4206                 unsigned int qlen;
4207
4208                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4209                         local_irq_enable();
4210                         __netif_receive_skb(skb);
4211                         local_irq_disable();
4212                         input_queue_head_incr(sd);
4213                         if (++work >= quota) {
4214                                 local_irq_enable();
4215                                 return work;
4216                         }
4217                 }
4218
4219                 rps_lock(sd);
4220                 qlen = skb_queue_len(&sd->input_pkt_queue);
4221                 if (qlen)
4222                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4223                                                    &sd->process_queue);
4224
4225                 if (qlen < quota - work) {
4226                         /*
4227                          * Inline a custom version of __napi_complete().
4228                          * only current cpu owns and manipulates this napi,
4229                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4230                          * we can use a plain write instead of clear_bit(),
4231                          * and we dont need an smp_mb() memory barrier.
4232                          */
4233                         list_del(&napi->poll_list);
4234                         napi->state = 0;
4235
4236                         quota = work + qlen;
4237                 }
4238                 rps_unlock(sd);
4239         }
4240         local_irq_enable();
4241
4242         return work;
4243 }
4244
4245 /**
4246  * __napi_schedule - schedule for receive
4247  * @n: entry to schedule
4248  *
4249  * The entry's receive function will be scheduled to run
4250  */
4251 void __napi_schedule(struct napi_struct *n)
4252 {
4253         unsigned long flags;
4254
4255         local_irq_save(flags);
4256         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4257         local_irq_restore(flags);
4258 }
4259 EXPORT_SYMBOL(__napi_schedule);
4260
4261 void __napi_complete(struct napi_struct *n)
4262 {
4263         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4264         BUG_ON(n->gro_list);
4265
4266         list_del(&n->poll_list);
4267         smp_mb__before_clear_bit();
4268         clear_bit(NAPI_STATE_SCHED, &n->state);
4269 }
4270 EXPORT_SYMBOL(__napi_complete);
4271
4272 void napi_complete(struct napi_struct *n)
4273 {
4274         unsigned long flags;
4275
4276         /*
4277          * don't let napi dequeue from the cpu poll list
4278          * just in case its running on a different cpu
4279          */
4280         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4281                 return;
4282
4283         napi_gro_flush(n, false);
4284         local_irq_save(flags);
4285         __napi_complete(n);
4286         local_irq_restore(flags);
4287 }
4288 EXPORT_SYMBOL(napi_complete);
4289
4290 /* must be called under rcu_read_lock(), as we dont take a reference */
4291 struct napi_struct *napi_by_id(unsigned int napi_id)
4292 {
4293         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4294         struct napi_struct *napi;
4295
4296         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4297                 if (napi->napi_id == napi_id)
4298                         return napi;
4299
4300         return NULL;
4301 }
4302 EXPORT_SYMBOL_GPL(napi_by_id);
4303
4304 void napi_hash_add(struct napi_struct *napi)
4305 {
4306         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4307
4308                 spin_lock(&napi_hash_lock);
4309
4310                 /* 0 is not a valid id, we also skip an id that is taken
4311                  * we expect both events to be extremely rare
4312                  */
4313                 napi->napi_id = 0;
4314                 while (!napi->napi_id) {
4315                         napi->napi_id = ++napi_gen_id;
4316                         if (napi_by_id(napi->napi_id))
4317                                 napi->napi_id = 0;
4318                 }
4319
4320                 hlist_add_head_rcu(&napi->napi_hash_node,
4321                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4322
4323                 spin_unlock(&napi_hash_lock);
4324         }
4325 }
4326 EXPORT_SYMBOL_GPL(napi_hash_add);
4327
4328 /* Warning : caller is responsible to make sure rcu grace period
4329  * is respected before freeing memory containing @napi
4330  */
4331 void napi_hash_del(struct napi_struct *napi)
4332 {
4333         spin_lock(&napi_hash_lock);
4334
4335         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4336                 hlist_del_rcu(&napi->napi_hash_node);
4337
4338         spin_unlock(&napi_hash_lock);
4339 }
4340 EXPORT_SYMBOL_GPL(napi_hash_del);
4341
4342 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4343                     int (*poll)(struct napi_struct *, int), int weight)
4344 {
4345         INIT_LIST_HEAD(&napi->poll_list);
4346         napi->gro_count = 0;
4347         napi->gro_list = NULL;
4348         napi->skb = NULL;
4349         napi->poll = poll;
4350         if (weight > NAPI_POLL_WEIGHT)
4351                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4352                             weight, dev->name);
4353         napi->weight = weight;
4354         list_add(&napi->dev_list, &dev->napi_list);
4355         napi->dev = dev;
4356 #ifdef CONFIG_NETPOLL
4357         spin_lock_init(&napi->poll_lock);
4358         napi->poll_owner = -1;
4359 #endif
4360         set_bit(NAPI_STATE_SCHED, &napi->state);
4361 }
4362 EXPORT_SYMBOL(netif_napi_add);
4363
4364 void netif_napi_del(struct napi_struct *napi)
4365 {
4366         list_del_init(&napi->dev_list);
4367         napi_free_frags(napi);
4368
4369         kfree_skb_list(napi->gro_list);
4370         napi->gro_list = NULL;
4371         napi->gro_count = 0;
4372 }
4373 EXPORT_SYMBOL(netif_napi_del);
4374
4375 static void net_rx_action(struct softirq_action *h)
4376 {
4377         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4378         unsigned long time_limit = jiffies + 2;
4379         int budget = netdev_budget;
4380         void *have;
4381
4382         local_irq_disable();
4383
4384         while (!list_empty(&sd->poll_list)) {
4385                 struct napi_struct *n;
4386                 int work, weight;
4387
4388                 /* If softirq window is exhuasted then punt.
4389                  * Allow this to run for 2 jiffies since which will allow
4390                  * an average latency of 1.5/HZ.
4391                  */
4392                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4393                         goto softnet_break;
4394
4395                 local_irq_enable();
4396
4397                 /* Even though interrupts have been re-enabled, this
4398                  * access is safe because interrupts can only add new
4399                  * entries to the tail of this list, and only ->poll()
4400                  * calls can remove this head entry from the list.
4401                  */
4402                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4403
4404                 have = netpoll_poll_lock(n);
4405
4406                 weight = n->weight;
4407
4408                 /* This NAPI_STATE_SCHED test is for avoiding a race
4409                  * with netpoll's poll_napi().  Only the entity which
4410                  * obtains the lock and sees NAPI_STATE_SCHED set will
4411                  * actually make the ->poll() call.  Therefore we avoid
4412                  * accidentally calling ->poll() when NAPI is not scheduled.
4413                  */
4414                 work = 0;
4415                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4416                         work = n->poll(n, weight);
4417                         trace_napi_poll(n);
4418                 }
4419
4420                 WARN_ON_ONCE(work > weight);
4421
4422                 budget -= work;
4423
4424                 local_irq_disable();
4425
4426                 /* Drivers must not modify the NAPI state if they
4427                  * consume the entire weight.  In such cases this code
4428                  * still "owns" the NAPI instance and therefore can
4429                  * move the instance around on the list at-will.
4430                  */
4431                 if (unlikely(work == weight)) {
4432                         if (unlikely(napi_disable_pending(n))) {
4433                                 local_irq_enable();
4434                                 napi_complete(n);
4435                                 local_irq_disable();
4436                         } else {
4437                                 if (n->gro_list) {
4438                                         /* flush too old packets
4439                                          * If HZ < 1000, flush all packets.
4440                                          */
4441                                         local_irq_enable();
4442                                         napi_gro_flush(n, HZ >= 1000);
4443                                         local_irq_disable();
4444                                 }
4445                                 list_move_tail(&n->poll_list, &sd->poll_list);
4446                         }
4447                 }
4448
4449                 netpoll_poll_unlock(have);
4450         }
4451 out:
4452         net_rps_action_and_irq_enable(sd);
4453
4454 #ifdef CONFIG_NET_DMA
4455         /*
4456          * There may not be any more sk_buffs coming right now, so push
4457          * any pending DMA copies to hardware
4458          */
4459         dma_issue_pending_all();
4460 #endif
4461
4462         return;
4463
4464 softnet_break:
4465         sd->time_squeeze++;
4466         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4467         goto out;
4468 }
4469
4470 struct netdev_adjacent {
4471         struct net_device *dev;
4472
4473         /* upper master flag, there can only be one master device per list */
4474         bool master;
4475
4476         /* counter for the number of times this device was added to us */
4477         u16 ref_nr;
4478
4479         /* private field for the users */
4480         void *private;
4481
4482         struct list_head list;
4483         struct rcu_head rcu;
4484 };
4485
4486 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4487                                                  struct net_device *adj_dev,
4488                                                  struct list_head *adj_list)
4489 {
4490         struct netdev_adjacent *adj;
4491
4492         list_for_each_entry(adj, adj_list, list) {
4493                 if (adj->dev == adj_dev)
4494                         return adj;
4495         }
4496         return NULL;
4497 }
4498
4499 /**
4500  * netdev_has_upper_dev - Check if device is linked to an upper device
4501  * @dev: device
4502  * @upper_dev: upper device to check
4503  *
4504  * Find out if a device is linked to specified upper device and return true
4505  * in case it is. Note that this checks only immediate upper device,
4506  * not through a complete stack of devices. The caller must hold the RTNL lock.
4507  */
4508 bool netdev_has_upper_dev(struct net_device *dev,
4509                           struct net_device *upper_dev)
4510 {
4511         ASSERT_RTNL();
4512
4513         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4514 }
4515 EXPORT_SYMBOL(netdev_has_upper_dev);
4516
4517 /**
4518  * netdev_has_any_upper_dev - Check if device is linked to some device
4519  * @dev: device
4520  *
4521  * Find out if a device is linked to an upper device and return true in case
4522  * it is. The caller must hold the RTNL lock.
4523  */
4524 static bool netdev_has_any_upper_dev(struct net_device *dev)
4525 {
4526         ASSERT_RTNL();
4527
4528         return !list_empty(&dev->all_adj_list.upper);
4529 }
4530
4531 /**
4532  * netdev_master_upper_dev_get - Get master upper device
4533  * @dev: device
4534  *
4535  * Find a master upper device and return pointer to it or NULL in case
4536  * it's not there. The caller must hold the RTNL lock.
4537  */
4538 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4539 {
4540         struct netdev_adjacent *upper;
4541
4542         ASSERT_RTNL();
4543
4544         if (list_empty(&dev->adj_list.upper))
4545                 return NULL;
4546
4547         upper = list_first_entry(&dev->adj_list.upper,
4548                                  struct netdev_adjacent, list);
4549         if (likely(upper->master))
4550                 return upper->dev;
4551         return NULL;
4552 }
4553 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4554
4555 void *netdev_adjacent_get_private(struct list_head *adj_list)
4556 {
4557         struct netdev_adjacent *adj;
4558
4559         adj = list_entry(adj_list, struct netdev_adjacent, list);
4560
4561         return adj->private;
4562 }
4563 EXPORT_SYMBOL(netdev_adjacent_get_private);
4564
4565 /**
4566  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4567  * @dev: device
4568  * @iter: list_head ** of the current position
4569  *
4570  * Gets the next device from the dev's upper list, starting from iter
4571  * position. The caller must hold RCU read lock.
4572  */
4573 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4574                                                  struct list_head **iter)
4575 {
4576         struct netdev_adjacent *upper;
4577
4578         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4579
4580         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4581
4582         if (&upper->list == &dev->adj_list.upper)
4583                 return NULL;
4584
4585         *iter = &upper->list;
4586
4587         return upper->dev;
4588 }
4589 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4590
4591 /**
4592  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4593  * @dev: device
4594  * @iter: list_head ** of the current position
4595  *
4596  * Gets the next device from the dev's upper list, starting from iter
4597  * position. The caller must hold RCU read lock.
4598  */
4599 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4600                                                      struct list_head **iter)
4601 {
4602         struct netdev_adjacent *upper;
4603
4604         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4605
4606         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4607
4608         if (&upper->list == &dev->all_adj_list.upper)
4609                 return NULL;
4610
4611         *iter = &upper->list;
4612
4613         return upper->dev;
4614 }
4615 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4616
4617 /**
4618  * netdev_lower_get_next_private - Get the next ->private from the
4619  *                                 lower neighbour list
4620  * @dev: device
4621  * @iter: list_head ** of the current position
4622  *
4623  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4624  * list, starting from iter position. The caller must hold either hold the
4625  * RTNL lock or its own locking that guarantees that the neighbour lower
4626  * list will remain unchainged.
4627  */
4628 void *netdev_lower_get_next_private(struct net_device *dev,
4629                                     struct list_head **iter)
4630 {
4631         struct netdev_adjacent *lower;
4632
4633         lower = list_entry(*iter, struct netdev_adjacent, list);
4634
4635         if (&lower->list == &dev->adj_list.lower)
4636                 return NULL;
4637
4638         *iter = lower->list.next;
4639
4640         return lower->private;
4641 }
4642 EXPORT_SYMBOL(netdev_lower_get_next_private);
4643
4644 /**
4645  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4646  *                                     lower neighbour list, RCU
4647  *                                     variant
4648  * @dev: device
4649  * @iter: list_head ** of the current position
4650  *
4651  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4652  * list, starting from iter position. The caller must hold RCU read lock.
4653  */
4654 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4655                                         struct list_head **iter)
4656 {
4657         struct netdev_adjacent *lower;
4658
4659         WARN_ON_ONCE(!rcu_read_lock_held());
4660
4661         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4662
4663         if (&lower->list == &dev->adj_list.lower)
4664                 return NULL;
4665
4666         *iter = &lower->list;
4667
4668         return lower->private;
4669 }
4670 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4671
4672 /**
4673  * netdev_lower_get_next - Get the next device from the lower neighbour
4674  *                         list
4675  * @dev: device
4676  * @iter: list_head ** of the current position
4677  *
4678  * Gets the next netdev_adjacent from the dev's lower neighbour
4679  * list, starting from iter position. The caller must hold RTNL lock or
4680  * its own locking that guarantees that the neighbour lower
4681  * list will remain unchainged.
4682  */
4683 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4684 {
4685         struct netdev_adjacent *lower;
4686
4687         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4688
4689         if (&lower->list == &dev->adj_list.lower)
4690                 return NULL;
4691
4692         *iter = &lower->list;
4693
4694         return lower->dev;
4695 }
4696 EXPORT_SYMBOL(netdev_lower_get_next);
4697
4698 /**
4699  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4700  *                                     lower neighbour list, RCU
4701  *                                     variant
4702  * @dev: device
4703  *
4704  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4705  * list. The caller must hold RCU read lock.
4706  */
4707 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4708 {
4709         struct netdev_adjacent *lower;
4710
4711         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4712                         struct netdev_adjacent, list);
4713         if (lower)
4714                 return lower->private;
4715         return NULL;
4716 }
4717 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4718
4719 /**
4720  * netdev_master_upper_dev_get_rcu - Get master upper device
4721  * @dev: device
4722  *
4723  * Find a master upper device and return pointer to it or NULL in case
4724  * it's not there. The caller must hold the RCU read lock.
4725  */
4726 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4727 {
4728         struct netdev_adjacent *upper;
4729
4730         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4731                                        struct netdev_adjacent, list);
4732         if (upper && likely(upper->master))
4733                 return upper->dev;
4734         return NULL;
4735 }
4736 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4737
4738 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4739                               struct net_device *adj_dev,
4740                               struct list_head *dev_list)
4741 {
4742         char linkname[IFNAMSIZ+7];
4743         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4744                 "upper_%s" : "lower_%s", adj_dev->name);
4745         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4746                                  linkname);
4747 }
4748 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4749                                char *name,
4750                                struct list_head *dev_list)
4751 {
4752         char linkname[IFNAMSIZ+7];
4753         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4754                 "upper_%s" : "lower_%s", name);
4755         sysfs_remove_link(&(dev->dev.kobj), linkname);
4756 }
4757
4758 #define netdev_adjacent_is_neigh_list(dev, dev_list) \
4759                 (dev_list == &dev->adj_list.upper || \
4760                  dev_list == &dev->adj_list.lower)
4761
4762 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4763                                         struct net_device *adj_dev,
4764                                         struct list_head *dev_list,
4765                                         void *private, bool master)
4766 {
4767         struct netdev_adjacent *adj;
4768         int ret;
4769
4770         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4771
4772         if (adj) {
4773                 adj->ref_nr++;
4774                 return 0;
4775         }
4776
4777         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4778         if (!adj)
4779                 return -ENOMEM;
4780
4781         adj->dev = adj_dev;
4782         adj->master = master;
4783         adj->ref_nr = 1;
4784         adj->private = private;
4785         dev_hold(adj_dev);
4786
4787         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4788                  adj_dev->name, dev->name, adj_dev->name);
4789
4790         if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
4791                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4792                 if (ret)
4793                         goto free_adj;
4794         }
4795
4796         /* Ensure that master link is always the first item in list. */
4797         if (master) {
4798                 ret = sysfs_create_link(&(dev->dev.kobj),
4799                                         &(adj_dev->dev.kobj), "master");
4800                 if (ret)
4801                         goto remove_symlinks;
4802
4803                 list_add_rcu(&adj->list, dev_list);
4804         } else {
4805                 list_add_tail_rcu(&adj->list, dev_list);
4806         }
4807
4808         return 0;
4809
4810 remove_symlinks:
4811         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4812                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4813 free_adj:
4814         kfree(adj);
4815         dev_put(adj_dev);
4816
4817         return ret;
4818 }
4819
4820 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4821                                          struct net_device *adj_dev,
4822                                          struct list_head *dev_list)
4823 {
4824         struct netdev_adjacent *adj;
4825
4826         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4827
4828         if (!adj) {
4829                 pr_err("tried to remove device %s from %s\n",
4830                        dev->name, adj_dev->name);
4831                 BUG();
4832         }
4833
4834         if (adj->ref_nr > 1) {
4835                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4836                          adj->ref_nr-1);
4837                 adj->ref_nr--;
4838                 return;
4839         }
4840
4841         if (adj->master)
4842                 sysfs_remove_link(&(dev->dev.kobj), "master");
4843
4844         if (netdev_adjacent_is_neigh_list(dev, dev_list))
4845                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4846
4847         list_del_rcu(&adj->list);
4848         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4849                  adj_dev->name, dev->name, adj_dev->name);
4850         dev_put(adj_dev);
4851         kfree_rcu(adj, rcu);
4852 }
4853
4854 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4855                                             struct net_device *upper_dev,
4856                                             struct list_head *up_list,
4857                                             struct list_head *down_list,
4858                                             void *private, bool master)
4859 {
4860         int ret;
4861
4862         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4863                                            master);
4864         if (ret)
4865                 return ret;
4866
4867         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4868                                            false);
4869         if (ret) {
4870                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4871                 return ret;
4872         }
4873
4874         return 0;
4875 }
4876
4877 static int __netdev_adjacent_dev_link(struct net_device *dev,
4878                                       struct net_device *upper_dev)
4879 {
4880         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4881                                                 &dev->all_adj_list.upper,
4882                                                 &upper_dev->all_adj_list.lower,
4883                                                 NULL, false);
4884 }
4885
4886 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4887                                                struct net_device *upper_dev,
4888                                                struct list_head *up_list,
4889                                                struct list_head *down_list)
4890 {
4891         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4892         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4893 }
4894
4895 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
4896                                          struct net_device *upper_dev)
4897 {
4898         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4899                                            &dev->all_adj_list.upper,
4900                                            &upper_dev->all_adj_list.lower);
4901 }
4902
4903 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4904                                                 struct net_device *upper_dev,
4905                                                 void *private, bool master)
4906 {
4907         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4908
4909         if (ret)
4910                 return ret;
4911
4912         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4913                                                &dev->adj_list.upper,
4914                                                &upper_dev->adj_list.lower,
4915                                                private, master);
4916         if (ret) {
4917                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4918                 return ret;
4919         }
4920
4921         return 0;
4922 }
4923
4924 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4925                                                    struct net_device *upper_dev)
4926 {
4927         __netdev_adjacent_dev_unlink(dev, upper_dev);
4928         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4929                                            &dev->adj_list.upper,
4930                                            &upper_dev->adj_list.lower);
4931 }
4932
4933 static int __netdev_upper_dev_link(struct net_device *dev,
4934                                    struct net_device *upper_dev, bool master,
4935                                    void *private)
4936 {
4937         struct netdev_adjacent *i, *j, *to_i, *to_j;
4938         int ret = 0;
4939
4940         ASSERT_RTNL();
4941
4942         if (dev == upper_dev)
4943                 return -EBUSY;
4944
4945         /* To prevent loops, check if dev is not upper device to upper_dev. */
4946         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4947                 return -EBUSY;
4948
4949         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4950                 return -EEXIST;
4951
4952         if (master && netdev_master_upper_dev_get(dev))
4953                 return -EBUSY;
4954
4955         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4956                                                    master);
4957         if (ret)
4958                 return ret;
4959
4960         /* Now that we linked these devs, make all the upper_dev's
4961          * all_adj_list.upper visible to every dev's all_adj_list.lower an
4962          * versa, and don't forget the devices itself. All of these
4963          * links are non-neighbours.
4964          */
4965         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4966                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4967                         pr_debug("Interlinking %s with %s, non-neighbour\n",
4968                                  i->dev->name, j->dev->name);
4969                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4970                         if (ret)
4971                                 goto rollback_mesh;
4972                 }
4973         }
4974
4975         /* add dev to every upper_dev's upper device */
4976         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4977                 pr_debug("linking %s's upper device %s with %s\n",
4978                          upper_dev->name, i->dev->name, dev->name);
4979                 ret = __netdev_adjacent_dev_link(dev, i->dev);
4980                 if (ret)
4981                         goto rollback_upper_mesh;
4982         }
4983
4984         /* add upper_dev to every dev's lower device */
4985         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4986                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
4987                          i->dev->name, upper_dev->name);
4988                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4989                 if (ret)
4990                         goto rollback_lower_mesh;
4991         }
4992
4993         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4994         return 0;
4995
4996 rollback_lower_mesh:
4997         to_i = i;
4998         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4999                 if (i == to_i)
5000                         break;
5001                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5002         }
5003
5004         i = NULL;
5005
5006 rollback_upper_mesh:
5007         to_i = i;
5008         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5009                 if (i == to_i)
5010                         break;
5011                 __netdev_adjacent_dev_unlink(dev, i->dev);
5012         }
5013
5014         i = j = NULL;
5015
5016 rollback_mesh:
5017         to_i = i;
5018         to_j = j;
5019         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5020                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5021                         if (i == to_i && j == to_j)
5022                                 break;
5023                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5024                 }
5025                 if (i == to_i)
5026                         break;
5027         }
5028
5029         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5030
5031         return ret;
5032 }
5033
5034 /**
5035  * netdev_upper_dev_link - Add a link to the upper device
5036  * @dev: device
5037  * @upper_dev: new upper device
5038  *
5039  * Adds a link to device which is upper to this one. The caller must hold
5040  * the RTNL lock. On a failure a negative errno code is returned.
5041  * On success the reference counts are adjusted and the function
5042  * returns zero.
5043  */
5044 int netdev_upper_dev_link(struct net_device *dev,
5045                           struct net_device *upper_dev)
5046 {
5047         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5048 }
5049 EXPORT_SYMBOL(netdev_upper_dev_link);
5050
5051 /**
5052  * netdev_master_upper_dev_link - Add a master link to the upper device
5053  * @dev: device
5054  * @upper_dev: new upper device
5055  *
5056  * Adds a link to device which is upper to this one. In this case, only
5057  * one master upper device can be linked, although other non-master devices
5058  * might be linked as well. The caller must hold the RTNL lock.
5059  * On a failure a negative errno code is returned. On success the reference
5060  * counts are adjusted and the function returns zero.
5061  */
5062 int netdev_master_upper_dev_link(struct net_device *dev,
5063                                  struct net_device *upper_dev)
5064 {
5065         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5066 }
5067 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5068
5069 int netdev_master_upper_dev_link_private(struct net_device *dev,
5070                                          struct net_device *upper_dev,
5071                                          void *private)
5072 {
5073         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5074 }
5075 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5076
5077 /**
5078  * netdev_upper_dev_unlink - Removes a link to upper device
5079  * @dev: device
5080  * @upper_dev: new upper device
5081  *
5082  * Removes a link to device which is upper to this one. The caller must hold
5083  * the RTNL lock.
5084  */
5085 void netdev_upper_dev_unlink(struct net_device *dev,
5086                              struct net_device *upper_dev)
5087 {
5088         struct netdev_adjacent *i, *j;
5089         ASSERT_RTNL();
5090
5091         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5092
5093         /* Here is the tricky part. We must remove all dev's lower
5094          * devices from all upper_dev's upper devices and vice
5095          * versa, to maintain the graph relationship.
5096          */
5097         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5098                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5099                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5100
5101         /* remove also the devices itself from lower/upper device
5102          * list
5103          */
5104         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5105                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5106
5107         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5108                 __netdev_adjacent_dev_unlink(dev, i->dev);
5109
5110         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5111 }
5112 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5113
5114 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5115 {
5116         struct netdev_adjacent *iter;
5117
5118         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5119                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5120                                           &iter->dev->adj_list.lower);
5121                 netdev_adjacent_sysfs_add(iter->dev, dev,
5122                                           &iter->dev->adj_list.lower);
5123         }
5124
5125         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5126                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5127                                           &iter->dev->adj_list.upper);
5128                 netdev_adjacent_sysfs_add(iter->dev, dev,
5129                                           &iter->dev->adj_list.upper);
5130         }
5131 }
5132
5133 void *netdev_lower_dev_get_private(struct net_device *dev,
5134                                    struct net_device *lower_dev)
5135 {
5136         struct netdev_adjacent *lower;
5137
5138         if (!lower_dev)
5139                 return NULL;
5140         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5141         if (!lower)
5142                 return NULL;
5143
5144         return lower->private;
5145 }
5146 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5147
5148
5149 int dev_get_nest_level(struct net_device *dev,
5150                        bool (*type_check)(struct net_device *dev))
5151 {
5152         struct net_device *lower = NULL;
5153         struct list_head *iter;
5154         int max_nest = -1;
5155         int nest;
5156
5157         ASSERT_RTNL();
5158
5159         netdev_for_each_lower_dev(dev, lower, iter) {
5160                 nest = dev_get_nest_level(lower, type_check);
5161                 if (max_nest < nest)
5162                         max_nest = nest;
5163         }
5164
5165         if (type_check(dev))
5166                 max_nest++;
5167
5168         return max_nest;
5169 }
5170 EXPORT_SYMBOL(dev_get_nest_level);
5171
5172 static void dev_change_rx_flags(struct net_device *dev, int flags)
5173 {
5174         const struct net_device_ops *ops = dev->netdev_ops;
5175
5176         if (ops->ndo_change_rx_flags)
5177                 ops->ndo_change_rx_flags(dev, flags);
5178 }
5179
5180 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5181 {
5182         unsigned int old_flags = dev->flags;
5183         kuid_t uid;
5184         kgid_t gid;
5185
5186         ASSERT_RTNL();
5187
5188         dev->flags |= IFF_PROMISC;
5189         dev->promiscuity += inc;
5190         if (dev->promiscuity == 0) {
5191                 /*
5192                  * Avoid overflow.
5193                  * If inc causes overflow, untouch promisc and return error.
5194                  */
5195                 if (inc < 0)
5196                         dev->flags &= ~IFF_PROMISC;
5197                 else {
5198                         dev->promiscuity -= inc;
5199                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5200                                 dev->name);
5201                         return -EOVERFLOW;
5202                 }
5203         }
5204         if (dev->flags != old_flags) {
5205                 pr_info("device %s %s promiscuous mode\n",
5206                         dev->name,
5207                         dev->flags & IFF_PROMISC ? "entered" : "left");
5208                 if (audit_enabled) {
5209                         current_uid_gid(&uid, &gid);
5210                         audit_log(current->audit_context, GFP_ATOMIC,
5211                                 AUDIT_ANOM_PROMISCUOUS,
5212                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5213                                 dev->name, (dev->flags & IFF_PROMISC),
5214                                 (old_flags & IFF_PROMISC),
5215                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5216                                 from_kuid(&init_user_ns, uid),
5217                                 from_kgid(&init_user_ns, gid),
5218                                 audit_get_sessionid(current));
5219                 }
5220
5221                 dev_change_rx_flags(dev, IFF_PROMISC);
5222         }
5223         if (notify)
5224                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5225         return 0;
5226 }
5227
5228 /**
5229  *      dev_set_promiscuity     - update promiscuity count on a device
5230  *      @dev: device
5231  *      @inc: modifier
5232  *
5233  *      Add or remove promiscuity from a device. While the count in the device
5234  *      remains above zero the interface remains promiscuous. Once it hits zero
5235  *      the device reverts back to normal filtering operation. A negative inc
5236  *      value is used to drop promiscuity on the device.
5237  *      Return 0 if successful or a negative errno code on error.
5238  */
5239 int dev_set_promiscuity(struct net_device *dev, int inc)
5240 {
5241         unsigned int old_flags = dev->flags;
5242         int err;
5243
5244         err = __dev_set_promiscuity(dev, inc, true);
5245         if (err < 0)
5246                 return err;
5247         if (dev->flags != old_flags)
5248                 dev_set_rx_mode(dev);
5249         return err;
5250 }
5251 EXPORT_SYMBOL(dev_set_promiscuity);
5252
5253 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5254 {
5255         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5256
5257         ASSERT_RTNL();
5258
5259         dev->flags |= IFF_ALLMULTI;
5260         dev->allmulti += inc;
5261         if (dev->allmulti == 0) {
5262                 /*
5263                  * Avoid overflow.
5264                  * If inc causes overflow, untouch allmulti and return error.
5265                  */
5266                 if (inc < 0)
5267                         dev->flags &= ~IFF_ALLMULTI;
5268                 else {
5269                         dev->allmulti -= inc;
5270                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5271                                 dev->name);
5272                         return -EOVERFLOW;
5273                 }
5274         }
5275         if (dev->flags ^ old_flags) {
5276                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5277                 dev_set_rx_mode(dev);
5278                 if (notify)
5279                         __dev_notify_flags(dev, old_flags,
5280                                            dev->gflags ^ old_gflags);
5281         }
5282         return 0;
5283 }
5284
5285 /**
5286  *      dev_set_allmulti        - update allmulti count on a device
5287  *      @dev: device
5288  *      @inc: modifier
5289  *
5290  *      Add or remove reception of all multicast frames to a device. While the
5291  *      count in the device remains above zero the interface remains listening
5292  *      to all interfaces. Once it hits zero the device reverts back to normal
5293  *      filtering operation. A negative @inc value is used to drop the counter
5294  *      when releasing a resource needing all multicasts.
5295  *      Return 0 if successful or a negative errno code on error.
5296  */
5297
5298 int dev_set_allmulti(struct net_device *dev, int inc)
5299 {
5300         return __dev_set_allmulti(dev, inc, true);
5301 }
5302 EXPORT_SYMBOL(dev_set_allmulti);
5303
5304 /*
5305  *      Upload unicast and multicast address lists to device and
5306  *      configure RX filtering. When the device doesn't support unicast
5307  *      filtering it is put in promiscuous mode while unicast addresses
5308  *      are present.
5309  */
5310 void __dev_set_rx_mode(struct net_device *dev)
5311 {
5312         const struct net_device_ops *ops = dev->netdev_ops;
5313
5314         /* dev_open will call this function so the list will stay sane. */
5315         if (!(dev->flags&IFF_UP))
5316                 return;
5317
5318         if (!netif_device_present(dev))
5319                 return;
5320
5321         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5322                 /* Unicast addresses changes may only happen under the rtnl,
5323                  * therefore calling __dev_set_promiscuity here is safe.
5324                  */
5325                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5326                         __dev_set_promiscuity(dev, 1, false);
5327                         dev->uc_promisc = true;
5328                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5329                         __dev_set_promiscuity(dev, -1, false);
5330                         dev->uc_promisc = false;
5331                 }
5332         }
5333
5334         if (ops->ndo_set_rx_mode)
5335                 ops->ndo_set_rx_mode(dev);
5336 }
5337
5338 void dev_set_rx_mode(struct net_device *dev)
5339 {
5340         netif_addr_lock_bh(dev);
5341         __dev_set_rx_mode(dev);
5342         netif_addr_unlock_bh(dev);
5343 }
5344
5345 /**
5346  *      dev_get_flags - get flags reported to userspace
5347  *      @dev: device
5348  *
5349  *      Get the combination of flag bits exported through APIs to userspace.
5350  */
5351 unsigned int dev_get_flags(const struct net_device *dev)
5352 {
5353         unsigned int flags;
5354
5355         flags = (dev->flags & ~(IFF_PROMISC |
5356                                 IFF_ALLMULTI |
5357                                 IFF_RUNNING |
5358                                 IFF_LOWER_UP |
5359                                 IFF_DORMANT)) |
5360                 (dev->gflags & (IFF_PROMISC |
5361                                 IFF_ALLMULTI));
5362
5363         if (netif_running(dev)) {
5364                 if (netif_oper_up(dev))
5365                         flags |= IFF_RUNNING;
5366                 if (netif_carrier_ok(dev))
5367                         flags |= IFF_LOWER_UP;
5368                 if (netif_dormant(dev))
5369                         flags |= IFF_DORMANT;
5370         }
5371
5372         return flags;
5373 }
5374 EXPORT_SYMBOL(dev_get_flags);
5375
5376 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5377 {
5378         unsigned int old_flags = dev->flags;
5379         int ret;
5380
5381         ASSERT_RTNL();
5382
5383         /*
5384          *      Set the flags on our device.
5385          */
5386
5387         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5388                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5389                                IFF_AUTOMEDIA)) |
5390                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5391                                     IFF_ALLMULTI));
5392
5393         /*
5394          *      Load in the correct multicast list now the flags have changed.
5395          */
5396
5397         if ((old_flags ^ flags) & IFF_MULTICAST)
5398                 dev_change_rx_flags(dev, IFF_MULTICAST);
5399
5400         dev_set_rx_mode(dev);
5401
5402         /*
5403          *      Have we downed the interface. We handle IFF_UP ourselves
5404          *      according to user attempts to set it, rather than blindly
5405          *      setting it.
5406          */
5407
5408         ret = 0;
5409         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5410                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5411
5412                 if (!ret)
5413                         dev_set_rx_mode(dev);
5414         }
5415
5416         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5417                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5418                 unsigned int old_flags = dev->flags;
5419
5420                 dev->gflags ^= IFF_PROMISC;
5421
5422                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5423                         if (dev->flags != old_flags)
5424                                 dev_set_rx_mode(dev);
5425         }
5426
5427         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5428            is important. Some (broken) drivers set IFF_PROMISC, when
5429            IFF_ALLMULTI is requested not asking us and not reporting.
5430          */
5431         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5432                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5433
5434                 dev->gflags ^= IFF_ALLMULTI;
5435                 __dev_set_allmulti(dev, inc, false);
5436         }
5437
5438         return ret;
5439 }
5440
5441 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5442                         unsigned int gchanges)
5443 {
5444         unsigned int changes = dev->flags ^ old_flags;
5445
5446         if (gchanges)
5447                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5448
5449         if (changes & IFF_UP) {
5450                 if (dev->flags & IFF_UP)
5451                         call_netdevice_notifiers(NETDEV_UP, dev);
5452                 else
5453                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5454         }
5455
5456         if (dev->flags & IFF_UP &&
5457             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5458                 struct netdev_notifier_change_info change_info;
5459
5460                 change_info.flags_changed = changes;
5461                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5462                                               &change_info.info);
5463         }
5464 }
5465
5466 /**
5467  *      dev_change_flags - change device settings
5468  *      @dev: device
5469  *      @flags: device state flags
5470  *
5471  *      Change settings on device based state flags. The flags are
5472  *      in the userspace exported format.
5473  */
5474 int dev_change_flags(struct net_device *dev, unsigned int flags)
5475 {
5476         int ret;
5477         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5478
5479         ret = __dev_change_flags(dev, flags);
5480         if (ret < 0)
5481                 return ret;
5482
5483         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5484         __dev_notify_flags(dev, old_flags, changes);
5485         return ret;
5486 }
5487 EXPORT_SYMBOL(dev_change_flags);
5488
5489 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5490 {
5491         const struct net_device_ops *ops = dev->netdev_ops;
5492
5493         if (ops->ndo_change_mtu)
5494                 return ops->ndo_change_mtu(dev, new_mtu);
5495
5496         dev->mtu = new_mtu;
5497         return 0;
5498 }
5499
5500 /**
5501  *      dev_set_mtu - Change maximum transfer unit
5502  *      @dev: device
5503  *      @new_mtu: new transfer unit
5504  *
5505  *      Change the maximum transfer size of the network device.
5506  */
5507 int dev_set_mtu(struct net_device *dev, int new_mtu)
5508 {
5509         int err, orig_mtu;
5510
5511         if (new_mtu == dev->mtu)
5512                 return 0;
5513
5514         /*      MTU must be positive.    */
5515         if (new_mtu < 0)
5516                 return -EINVAL;
5517
5518         if (!netif_device_present(dev))
5519                 return -ENODEV;
5520
5521         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5522         err = notifier_to_errno(err);
5523         if (err)
5524                 return err;
5525
5526         orig_mtu = dev->mtu;
5527         err = __dev_set_mtu(dev, new_mtu);
5528
5529         if (!err) {
5530                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5531                 err = notifier_to_errno(err);
5532                 if (err) {
5533                         /* setting mtu back and notifying everyone again,
5534                          * so that they have a chance to revert changes.
5535                          */
5536                         __dev_set_mtu(dev, orig_mtu);
5537                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5538                 }
5539         }
5540         return err;
5541 }
5542 EXPORT_SYMBOL(dev_set_mtu);
5543
5544 /**
5545  *      dev_set_group - Change group this device belongs to
5546  *      @dev: device
5547  *      @new_group: group this device should belong to
5548  */
5549 void dev_set_group(struct net_device *dev, int new_group)
5550 {
5551         dev->group = new_group;
5552 }
5553 EXPORT_SYMBOL(dev_set_group);
5554
5555 /**
5556  *      dev_set_mac_address - Change Media Access Control Address
5557  *      @dev: device
5558  *      @sa: new address
5559  *
5560  *      Change the hardware (MAC) address of the device
5561  */
5562 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5563 {
5564         const struct net_device_ops *ops = dev->netdev_ops;
5565         int err;
5566
5567         if (!ops->ndo_set_mac_address)
5568                 return -EOPNOTSUPP;
5569         if (sa->sa_family != dev->type)
5570                 return -EINVAL;
5571         if (!netif_device_present(dev))
5572                 return -ENODEV;
5573         err = ops->ndo_set_mac_address(dev, sa);
5574         if (err)
5575                 return err;
5576         dev->addr_assign_type = NET_ADDR_SET;
5577         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5578         add_device_randomness(dev->dev_addr, dev->addr_len);
5579         return 0;
5580 }
5581 EXPORT_SYMBOL(dev_set_mac_address);
5582
5583 /**
5584  *      dev_change_carrier - Change device carrier
5585  *      @dev: device
5586  *      @new_carrier: new value
5587  *
5588  *      Change device carrier
5589  */
5590 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5591 {
5592         const struct net_device_ops *ops = dev->netdev_ops;
5593
5594         if (!ops->ndo_change_carrier)
5595                 return -EOPNOTSUPP;
5596         if (!netif_device_present(dev))
5597                 return -ENODEV;
5598         return ops->ndo_change_carrier(dev, new_carrier);
5599 }
5600 EXPORT_SYMBOL(dev_change_carrier);
5601
5602 /**
5603  *      dev_get_phys_port_id - Get device physical port ID
5604  *      @dev: device
5605  *      @ppid: port ID
5606  *
5607  *      Get device physical port ID
5608  */
5609 int dev_get_phys_port_id(struct net_device *dev,
5610                          struct netdev_phys_port_id *ppid)
5611 {
5612         const struct net_device_ops *ops = dev->netdev_ops;
5613
5614         if (!ops->ndo_get_phys_port_id)
5615                 return -EOPNOTSUPP;
5616         return ops->ndo_get_phys_port_id(dev, ppid);
5617 }
5618 EXPORT_SYMBOL(dev_get_phys_port_id);
5619
5620 /**
5621  *      dev_new_index   -       allocate an ifindex
5622  *      @net: the applicable net namespace
5623  *
5624  *      Returns a suitable unique value for a new device interface
5625  *      number.  The caller must hold the rtnl semaphore or the
5626  *      dev_base_lock to be sure it remains unique.
5627  */
5628 static int dev_new_index(struct net *net)
5629 {
5630         int ifindex = net->ifindex;
5631         for (;;) {
5632                 if (++ifindex <= 0)
5633                         ifindex = 1;
5634                 if (!__dev_get_by_index(net, ifindex))
5635                         return net->ifindex = ifindex;
5636         }
5637 }
5638
5639 /* Delayed registration/unregisteration */
5640 static LIST_HEAD(net_todo_list);
5641 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5642
5643 static void net_set_todo(struct net_device *dev)
5644 {
5645         list_add_tail(&dev->todo_list, &net_todo_list);
5646         dev_net(dev)->dev_unreg_count++;
5647 }
5648
5649 static void rollback_registered_many(struct list_head *head)
5650 {
5651         struct net_device *dev, *tmp;
5652         LIST_HEAD(close_head);
5653
5654         BUG_ON(dev_boot_phase);
5655         ASSERT_RTNL();
5656
5657         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5658                 /* Some devices call without registering
5659                  * for initialization unwind. Remove those
5660                  * devices and proceed with the remaining.
5661                  */
5662                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5663                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5664                                  dev->name, dev);
5665
5666                         WARN_ON(1);
5667                         list_del(&dev->unreg_list);
5668                         continue;
5669                 }
5670                 dev->dismantle = true;
5671                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5672         }
5673
5674         /* If device is running, close it first. */
5675         list_for_each_entry(dev, head, unreg_list)
5676                 list_add_tail(&dev->close_list, &close_head);
5677         dev_close_many(&close_head);
5678
5679         list_for_each_entry(dev, head, unreg_list) {
5680                 /* And unlink it from device chain. */
5681                 unlist_netdevice(dev);
5682
5683                 dev->reg_state = NETREG_UNREGISTERING;
5684         }
5685
5686         synchronize_net();
5687
5688         list_for_each_entry(dev, head, unreg_list) {
5689                 /* Shutdown queueing discipline. */
5690                 dev_shutdown(dev);
5691
5692
5693                 /* Notify protocols, that we are about to destroy
5694                    this device. They should clean all the things.
5695                 */
5696                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5697
5698                 /*
5699                  *      Flush the unicast and multicast chains
5700                  */
5701                 dev_uc_flush(dev);
5702                 dev_mc_flush(dev);
5703
5704                 if (dev->netdev_ops->ndo_uninit)
5705                         dev->netdev_ops->ndo_uninit(dev);
5706
5707                 if (!dev->rtnl_link_ops ||
5708                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5709                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5710
5711                 /* Notifier chain MUST detach us all upper devices. */
5712                 WARN_ON(netdev_has_any_upper_dev(dev));
5713
5714                 /* Remove entries from kobject tree */
5715                 netdev_unregister_kobject(dev);
5716 #ifdef CONFIG_XPS
5717                 /* Remove XPS queueing entries */
5718                 netif_reset_xps_queues_gt(dev, 0);
5719 #endif
5720         }
5721
5722         synchronize_net();
5723
5724         list_for_each_entry(dev, head, unreg_list)
5725                 dev_put(dev);
5726 }
5727
5728 static void rollback_registered(struct net_device *dev)
5729 {
5730         LIST_HEAD(single);
5731
5732         list_add(&dev->unreg_list, &single);
5733         rollback_registered_many(&single);
5734         list_del(&single);
5735 }
5736
5737 static netdev_features_t netdev_fix_features(struct net_device *dev,
5738         netdev_features_t features)
5739 {
5740         /* Fix illegal checksum combinations */
5741         if ((features & NETIF_F_HW_CSUM) &&
5742             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5743                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5744                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5745         }
5746
5747         /* TSO requires that SG is present as well. */
5748         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5749                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5750                 features &= ~NETIF_F_ALL_TSO;
5751         }
5752
5753         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5754                                         !(features & NETIF_F_IP_CSUM)) {
5755                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5756                 features &= ~NETIF_F_TSO;
5757                 features &= ~NETIF_F_TSO_ECN;
5758         }
5759
5760         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5761                                          !(features & NETIF_F_IPV6_CSUM)) {
5762                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5763                 features &= ~NETIF_F_TSO6;
5764         }
5765
5766         /* TSO ECN requires that TSO is present as well. */
5767         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5768                 features &= ~NETIF_F_TSO_ECN;
5769
5770         /* Software GSO depends on SG. */
5771         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5772                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5773                 features &= ~NETIF_F_GSO;
5774         }
5775
5776         /* UFO needs SG and checksumming */
5777         if (features & NETIF_F_UFO) {
5778                 /* maybe split UFO into V4 and V6? */
5779                 if (!((features & NETIF_F_GEN_CSUM) ||
5780                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5781                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5782                         netdev_dbg(dev,
5783                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5784                         features &= ~NETIF_F_UFO;
5785                 }
5786
5787                 if (!(features & NETIF_F_SG)) {
5788                         netdev_dbg(dev,
5789                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5790                         features &= ~NETIF_F_UFO;
5791                 }
5792         }
5793
5794 #ifdef CONFIG_NET_RX_BUSY_POLL
5795         if (dev->netdev_ops->ndo_busy_poll)
5796                 features |= NETIF_F_BUSY_POLL;
5797         else
5798 #endif
5799                 features &= ~NETIF_F_BUSY_POLL;
5800
5801         return features;
5802 }
5803
5804 int __netdev_update_features(struct net_device *dev)
5805 {
5806         netdev_features_t features;
5807         int err = 0;
5808
5809         ASSERT_RTNL();
5810
5811         features = netdev_get_wanted_features(dev);
5812
5813         if (dev->netdev_ops->ndo_fix_features)
5814                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5815
5816         /* driver might be less strict about feature dependencies */
5817         features = netdev_fix_features(dev, features);
5818
5819         if (dev->features == features)
5820                 return 0;
5821
5822         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5823                 &dev->features, &features);
5824
5825         if (dev->netdev_ops->ndo_set_features)
5826                 err = dev->netdev_ops->ndo_set_features(dev, features);
5827
5828         if (unlikely(err < 0)) {
5829                 netdev_err(dev,
5830                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5831                         err, &features, &dev->features);
5832                 return -1;
5833         }
5834
5835         if (!err)
5836                 dev->features = features;
5837
5838         return 1;
5839 }
5840
5841 /**
5842  *      netdev_update_features - recalculate device features
5843  *      @dev: the device to check
5844  *
5845  *      Recalculate dev->features set and send notifications if it
5846  *      has changed. Should be called after driver or hardware dependent
5847  *      conditions might have changed that influence the features.
5848  */
5849 void netdev_update_features(struct net_device *dev)
5850 {
5851         if (__netdev_update_features(dev))
5852                 netdev_features_change(dev);
5853 }
5854 EXPORT_SYMBOL(netdev_update_features);
5855
5856 /**
5857  *      netdev_change_features - recalculate device features
5858  *      @dev: the device to check
5859  *
5860  *      Recalculate dev->features set and send notifications even
5861  *      if they have not changed. Should be called instead of
5862  *      netdev_update_features() if also dev->vlan_features might
5863  *      have changed to allow the changes to be propagated to stacked
5864  *      VLAN devices.
5865  */
5866 void netdev_change_features(struct net_device *dev)
5867 {
5868         __netdev_update_features(dev);
5869         netdev_features_change(dev);
5870 }
5871 EXPORT_SYMBOL(netdev_change_features);
5872
5873 /**
5874  *      netif_stacked_transfer_operstate -      transfer operstate
5875  *      @rootdev: the root or lower level device to transfer state from
5876  *      @dev: the device to transfer operstate to
5877  *
5878  *      Transfer operational state from root to device. This is normally
5879  *      called when a stacking relationship exists between the root
5880  *      device and the device(a leaf device).
5881  */
5882 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5883                                         struct net_device *dev)
5884 {
5885         if (rootdev->operstate == IF_OPER_DORMANT)
5886                 netif_dormant_on(dev);
5887         else
5888                 netif_dormant_off(dev);
5889
5890         if (netif_carrier_ok(rootdev)) {
5891                 if (!netif_carrier_ok(dev))
5892                         netif_carrier_on(dev);
5893         } else {
5894                 if (netif_carrier_ok(dev))
5895                         netif_carrier_off(dev);
5896         }
5897 }
5898 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5899
5900 #ifdef CONFIG_SYSFS
5901 static int netif_alloc_rx_queues(struct net_device *dev)
5902 {
5903         unsigned int i, count = dev->num_rx_queues;
5904         struct netdev_rx_queue *rx;
5905
5906         BUG_ON(count < 1);
5907
5908         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5909         if (!rx)
5910                 return -ENOMEM;
5911
5912         dev->_rx = rx;
5913
5914         for (i = 0; i < count; i++)
5915                 rx[i].dev = dev;
5916         return 0;
5917 }
5918 #endif
5919
5920 static void netdev_init_one_queue(struct net_device *dev,
5921                                   struct netdev_queue *queue, void *_unused)
5922 {
5923         /* Initialize queue lock */
5924         spin_lock_init(&queue->_xmit_lock);
5925         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5926         queue->xmit_lock_owner = -1;
5927         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5928         queue->dev = dev;
5929 #ifdef CONFIG_BQL
5930         dql_init(&queue->dql, HZ);
5931 #endif
5932 }
5933
5934 static void netif_free_tx_queues(struct net_device *dev)
5935 {
5936         if (is_vmalloc_addr(dev->_tx))
5937                 vfree(dev->_tx);
5938         else
5939                 kfree(dev->_tx);
5940 }
5941
5942 static int netif_alloc_netdev_queues(struct net_device *dev)
5943 {
5944         unsigned int count = dev->num_tx_queues;
5945         struct netdev_queue *tx;
5946         size_t sz = count * sizeof(*tx);
5947
5948         BUG_ON(count < 1 || count > 0xffff);
5949
5950         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5951         if (!tx) {
5952                 tx = vzalloc(sz);
5953                 if (!tx)
5954                         return -ENOMEM;
5955         }
5956         dev->_tx = tx;
5957
5958         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5959         spin_lock_init(&dev->tx_global_lock);
5960
5961         return 0;
5962 }
5963
5964 /**
5965  *      register_netdevice      - register a network device
5966  *      @dev: device to register
5967  *
5968  *      Take a completed network device structure and add it to the kernel
5969  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5970  *      chain. 0 is returned on success. A negative errno code is returned
5971  *      on a failure to set up the device, or if the name is a duplicate.
5972  *
5973  *      Callers must hold the rtnl semaphore. You may want
5974  *      register_netdev() instead of this.
5975  *
5976  *      BUGS:
5977  *      The locking appears insufficient to guarantee two parallel registers
5978  *      will not get the same name.
5979  */
5980
5981 int register_netdevice(struct net_device *dev)
5982 {
5983         int ret;
5984         struct net *net = dev_net(dev);
5985
5986         BUG_ON(dev_boot_phase);
5987         ASSERT_RTNL();
5988
5989         might_sleep();
5990
5991         /* When net_device's are persistent, this will be fatal. */
5992         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5993         BUG_ON(!net);
5994
5995         spin_lock_init(&dev->addr_list_lock);
5996         netdev_set_addr_lockdep_class(dev);
5997
5998         dev->iflink = -1;
5999
6000         ret = dev_get_valid_name(net, dev, dev->name);
6001         if (ret < 0)
6002                 goto out;
6003
6004         /* Init, if this function is available */
6005         if (dev->netdev_ops->ndo_init) {
6006                 ret = dev->netdev_ops->ndo_init(dev);
6007                 if (ret) {
6008                         if (ret > 0)
6009                                 ret = -EIO;
6010                         goto out;
6011                 }
6012         }
6013
6014         if (((dev->hw_features | dev->features) &
6015              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6016             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6017              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6018                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6019                 ret = -EINVAL;
6020                 goto err_uninit;
6021         }
6022
6023         ret = -EBUSY;
6024         if (!dev->ifindex)
6025                 dev->ifindex = dev_new_index(net);
6026         else if (__dev_get_by_index(net, dev->ifindex))
6027                 goto err_uninit;
6028
6029         if (dev->iflink == -1)
6030                 dev->iflink = dev->ifindex;
6031
6032         /* Transfer changeable features to wanted_features and enable
6033          * software offloads (GSO and GRO).
6034          */
6035         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6036         dev->features |= NETIF_F_SOFT_FEATURES;
6037         dev->wanted_features = dev->features & dev->hw_features;
6038
6039         if (!(dev->flags & IFF_LOOPBACK)) {
6040                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6041         }
6042
6043         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6044          */
6045         dev->vlan_features |= NETIF_F_HIGHDMA;
6046
6047         /* Make NETIF_F_SG inheritable to tunnel devices.
6048          */
6049         dev->hw_enc_features |= NETIF_F_SG;
6050
6051         /* Make NETIF_F_SG inheritable to MPLS.
6052          */
6053         dev->mpls_features |= NETIF_F_SG;
6054
6055         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6056         ret = notifier_to_errno(ret);
6057         if (ret)
6058                 goto err_uninit;
6059
6060         ret = netdev_register_kobject(dev);
6061         if (ret)
6062                 goto err_uninit;
6063         dev->reg_state = NETREG_REGISTERED;
6064
6065         __netdev_update_features(dev);
6066
6067         /*
6068          *      Default initial state at registry is that the
6069          *      device is present.
6070          */
6071
6072         set_bit(__LINK_STATE_PRESENT, &dev->state);
6073
6074         linkwatch_init_dev(dev);
6075
6076         dev_init_scheduler(dev);
6077         dev_hold(dev);
6078         list_netdevice(dev);
6079         add_device_randomness(dev->dev_addr, dev->addr_len);
6080
6081         /* If the device has permanent device address, driver should
6082          * set dev_addr and also addr_assign_type should be set to
6083          * NET_ADDR_PERM (default value).
6084          */
6085         if (dev->addr_assign_type == NET_ADDR_PERM)
6086                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6087
6088         /* Notify protocols, that a new device appeared. */
6089         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6090         ret = notifier_to_errno(ret);
6091         if (ret) {
6092                 rollback_registered(dev);
6093                 dev->reg_state = NETREG_UNREGISTERED;
6094         }
6095         /*
6096          *      Prevent userspace races by waiting until the network
6097          *      device is fully setup before sending notifications.
6098          */
6099         if (!dev->rtnl_link_ops ||
6100             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6101                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6102
6103 out:
6104         return ret;
6105
6106 err_uninit:
6107         if (dev->netdev_ops->ndo_uninit)
6108                 dev->netdev_ops->ndo_uninit(dev);
6109         goto out;
6110 }
6111 EXPORT_SYMBOL(register_netdevice);
6112
6113 /**
6114  *      init_dummy_netdev       - init a dummy network device for NAPI
6115  *      @dev: device to init
6116  *
6117  *      This takes a network device structure and initialize the minimum
6118  *      amount of fields so it can be used to schedule NAPI polls without
6119  *      registering a full blown interface. This is to be used by drivers
6120  *      that need to tie several hardware interfaces to a single NAPI
6121  *      poll scheduler due to HW limitations.
6122  */
6123 int init_dummy_netdev(struct net_device *dev)
6124 {
6125         /* Clear everything. Note we don't initialize spinlocks
6126          * are they aren't supposed to be taken by any of the
6127          * NAPI code and this dummy netdev is supposed to be
6128          * only ever used for NAPI polls
6129          */
6130         memset(dev, 0, sizeof(struct net_device));
6131
6132         /* make sure we BUG if trying to hit standard
6133          * register/unregister code path
6134          */
6135         dev->reg_state = NETREG_DUMMY;
6136
6137         /* NAPI wants this */
6138         INIT_LIST_HEAD(&dev->napi_list);
6139
6140         /* a dummy interface is started by default */
6141         set_bit(__LINK_STATE_PRESENT, &dev->state);
6142         set_bit(__LINK_STATE_START, &dev->state);
6143
6144         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6145          * because users of this 'device' dont need to change
6146          * its refcount.
6147          */
6148
6149         return 0;
6150 }
6151 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6152
6153
6154 /**
6155  *      register_netdev - register a network device
6156  *      @dev: device to register
6157  *
6158  *      Take a completed network device structure and add it to the kernel
6159  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6160  *      chain. 0 is returned on success. A negative errno code is returned
6161  *      on a failure to set up the device, or if the name is a duplicate.
6162  *
6163  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6164  *      and expands the device name if you passed a format string to
6165  *      alloc_netdev.
6166  */
6167 int register_netdev(struct net_device *dev)
6168 {
6169         int err;
6170
6171         rtnl_lock();
6172         err = register_netdevice(dev);
6173         rtnl_unlock();
6174         return err;
6175 }
6176 EXPORT_SYMBOL(register_netdev);
6177
6178 int netdev_refcnt_read(const struct net_device *dev)
6179 {
6180         int i, refcnt = 0;
6181
6182         for_each_possible_cpu(i)
6183                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6184         return refcnt;
6185 }
6186 EXPORT_SYMBOL(netdev_refcnt_read);
6187
6188 /**
6189  * netdev_wait_allrefs - wait until all references are gone.
6190  * @dev: target net_device
6191  *
6192  * This is called when unregistering network devices.
6193  *
6194  * Any protocol or device that holds a reference should register
6195  * for netdevice notification, and cleanup and put back the
6196  * reference if they receive an UNREGISTER event.
6197  * We can get stuck here if buggy protocols don't correctly
6198  * call dev_put.
6199  */
6200 static void netdev_wait_allrefs(struct net_device *dev)
6201 {
6202         unsigned long rebroadcast_time, warning_time;
6203         int refcnt;
6204
6205         linkwatch_forget_dev(dev);
6206
6207         rebroadcast_time = warning_time = jiffies;
6208         refcnt = netdev_refcnt_read(dev);
6209
6210         while (refcnt != 0) {
6211                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6212                         rtnl_lock();
6213
6214                         /* Rebroadcast unregister notification */
6215                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6216
6217                         __rtnl_unlock();
6218                         rcu_barrier();
6219                         rtnl_lock();
6220
6221                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6222                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6223                                      &dev->state)) {
6224                                 /* We must not have linkwatch events
6225                                  * pending on unregister. If this
6226                                  * happens, we simply run the queue
6227                                  * unscheduled, resulting in a noop
6228                                  * for this device.
6229                                  */
6230                                 linkwatch_run_queue();
6231                         }
6232
6233                         __rtnl_unlock();
6234
6235                         rebroadcast_time = jiffies;
6236                 }
6237
6238                 msleep(250);
6239
6240                 refcnt = netdev_refcnt_read(dev);
6241
6242                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6243                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6244                                  dev->name, refcnt);
6245                         warning_time = jiffies;
6246                 }
6247         }
6248 }
6249
6250 /* The sequence is:
6251  *
6252  *      rtnl_lock();
6253  *      ...
6254  *      register_netdevice(x1);
6255  *      register_netdevice(x2);
6256  *      ...
6257  *      unregister_netdevice(y1);
6258  *      unregister_netdevice(y2);
6259  *      ...
6260  *      rtnl_unlock();
6261  *      free_netdev(y1);
6262  *      free_netdev(y2);
6263  *
6264  * We are invoked by rtnl_unlock().
6265  * This allows us to deal with problems:
6266  * 1) We can delete sysfs objects which invoke hotplug
6267  *    without deadlocking with linkwatch via keventd.
6268  * 2) Since we run with the RTNL semaphore not held, we can sleep
6269  *    safely in order to wait for the netdev refcnt to drop to zero.
6270  *
6271  * We must not return until all unregister events added during
6272  * the interval the lock was held have been completed.
6273  */
6274 void netdev_run_todo(void)
6275 {
6276         struct list_head list;
6277
6278         /* Snapshot list, allow later requests */
6279         list_replace_init(&net_todo_list, &list);
6280
6281         __rtnl_unlock();
6282
6283
6284         /* Wait for rcu callbacks to finish before next phase */
6285         if (!list_empty(&list))
6286                 rcu_barrier();
6287
6288         while (!list_empty(&list)) {
6289                 struct net_device *dev
6290                         = list_first_entry(&list, struct net_device, todo_list);
6291                 list_del(&dev->todo_list);
6292
6293                 rtnl_lock();
6294                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6295                 __rtnl_unlock();
6296
6297                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6298                         pr_err("network todo '%s' but state %d\n",
6299                                dev->name, dev->reg_state);
6300                         dump_stack();
6301                         continue;
6302                 }
6303
6304                 dev->reg_state = NETREG_UNREGISTERED;
6305
6306                 on_each_cpu(flush_backlog, dev, 1);
6307
6308                 netdev_wait_allrefs(dev);
6309
6310                 /* paranoia */
6311                 BUG_ON(netdev_refcnt_read(dev));
6312                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6313                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6314                 WARN_ON(dev->dn_ptr);
6315
6316                 if (dev->destructor)
6317                         dev->destructor(dev);
6318
6319                 /* Report a network device has been unregistered */
6320                 rtnl_lock();
6321                 dev_net(dev)->dev_unreg_count--;
6322                 __rtnl_unlock();
6323                 wake_up(&netdev_unregistering_wq);
6324
6325                 /* Free network device */
6326                 kobject_put(&dev->dev.kobj);
6327         }
6328 }
6329
6330 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6331  * fields in the same order, with only the type differing.
6332  */
6333 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6334                              const struct net_device_stats *netdev_stats)
6335 {
6336 #if BITS_PER_LONG == 64
6337         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6338         memcpy(stats64, netdev_stats, sizeof(*stats64));
6339 #else
6340         size_t i, n = sizeof(*stats64) / sizeof(u64);
6341         const unsigned long *src = (const unsigned long *)netdev_stats;
6342         u64 *dst = (u64 *)stats64;
6343
6344         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6345                      sizeof(*stats64) / sizeof(u64));
6346         for (i = 0; i < n; i++)
6347                 dst[i] = src[i];
6348 #endif
6349 }
6350 EXPORT_SYMBOL(netdev_stats_to_stats64);
6351
6352 /**
6353  *      dev_get_stats   - get network device statistics
6354  *      @dev: device to get statistics from
6355  *      @storage: place to store stats
6356  *
6357  *      Get network statistics from device. Return @storage.
6358  *      The device driver may provide its own method by setting
6359  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6360  *      otherwise the internal statistics structure is used.
6361  */
6362 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6363                                         struct rtnl_link_stats64 *storage)
6364 {
6365         const struct net_device_ops *ops = dev->netdev_ops;
6366
6367         if (ops->ndo_get_stats64) {
6368                 memset(storage, 0, sizeof(*storage));
6369                 ops->ndo_get_stats64(dev, storage);
6370         } else if (ops->ndo_get_stats) {
6371                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6372         } else {
6373                 netdev_stats_to_stats64(storage, &dev->stats);
6374         }
6375         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6376         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6377         return storage;
6378 }
6379 EXPORT_SYMBOL(dev_get_stats);
6380
6381 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6382 {
6383         struct netdev_queue *queue = dev_ingress_queue(dev);
6384
6385 #ifdef CONFIG_NET_CLS_ACT
6386         if (queue)
6387                 return queue;
6388         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6389         if (!queue)
6390                 return NULL;
6391         netdev_init_one_queue(dev, queue, NULL);
6392         queue->qdisc = &noop_qdisc;
6393         queue->qdisc_sleeping = &noop_qdisc;
6394         rcu_assign_pointer(dev->ingress_queue, queue);
6395 #endif
6396         return queue;
6397 }
6398
6399 static const struct ethtool_ops default_ethtool_ops;
6400
6401 void netdev_set_default_ethtool_ops(struct net_device *dev,
6402                                     const struct ethtool_ops *ops)
6403 {
6404         if (dev->ethtool_ops == &default_ethtool_ops)
6405                 dev->ethtool_ops = ops;
6406 }
6407 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6408
6409 void netdev_freemem(struct net_device *dev)
6410 {
6411         char *addr = (char *)dev - dev->padded;
6412
6413         if (is_vmalloc_addr(addr))
6414                 vfree(addr);
6415         else
6416                 kfree(addr);
6417 }
6418
6419 /**
6420  *      alloc_netdev_mqs - allocate network device
6421  *      @sizeof_priv:   size of private data to allocate space for
6422  *      @name:          device name format string
6423  *      @setup:         callback to initialize device
6424  *      @txqs:          the number of TX subqueues to allocate
6425  *      @rxqs:          the number of RX subqueues to allocate
6426  *
6427  *      Allocates a struct net_device with private data area for driver use
6428  *      and performs basic initialization.  Also allocates subqueue structs
6429  *      for each queue on the device.
6430  */
6431 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6432                 void (*setup)(struct net_device *),
6433                 unsigned int txqs, unsigned int rxqs)
6434 {
6435         struct net_device *dev;
6436         size_t alloc_size;
6437         struct net_device *p;
6438
6439         BUG_ON(strlen(name) >= sizeof(dev->name));
6440
6441         if (txqs < 1) {
6442                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6443                 return NULL;
6444         }
6445
6446 #ifdef CONFIG_SYSFS
6447         if (rxqs < 1) {
6448                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6449                 return NULL;
6450         }
6451 #endif
6452
6453         alloc_size = sizeof(struct net_device);
6454         if (sizeof_priv) {
6455                 /* ensure 32-byte alignment of private area */
6456                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6457                 alloc_size += sizeof_priv;
6458         }
6459         /* ensure 32-byte alignment of whole construct */
6460         alloc_size += NETDEV_ALIGN - 1;
6461
6462         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6463         if (!p)
6464                 p = vzalloc(alloc_size);
6465         if (!p)
6466                 return NULL;
6467
6468         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6469         dev->padded = (char *)dev - (char *)p;
6470
6471         dev->pcpu_refcnt = alloc_percpu(int);
6472         if (!dev->pcpu_refcnt)
6473                 goto free_dev;
6474
6475         if (dev_addr_init(dev))
6476                 goto free_pcpu;
6477
6478         dev_mc_init(dev);
6479         dev_uc_init(dev);
6480
6481         dev_net_set(dev, &init_net);
6482
6483         dev->gso_max_size = GSO_MAX_SIZE;
6484         dev->gso_max_segs = GSO_MAX_SEGS;
6485
6486         INIT_LIST_HEAD(&dev->napi_list);
6487         INIT_LIST_HEAD(&dev->unreg_list);
6488         INIT_LIST_HEAD(&dev->close_list);
6489         INIT_LIST_HEAD(&dev->link_watch_list);
6490         INIT_LIST_HEAD(&dev->adj_list.upper);
6491         INIT_LIST_HEAD(&dev->adj_list.lower);
6492         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6493         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6494         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6495         setup(dev);
6496
6497         dev->num_tx_queues = txqs;
6498         dev->real_num_tx_queues = txqs;
6499         if (netif_alloc_netdev_queues(dev))
6500                 goto free_all;
6501
6502 #ifdef CONFIG_SYSFS
6503         dev->num_rx_queues = rxqs;
6504         dev->real_num_rx_queues = rxqs;
6505         if (netif_alloc_rx_queues(dev))
6506                 goto free_all;
6507 #endif
6508
6509         strcpy(dev->name, name);
6510         dev->group = INIT_NETDEV_GROUP;
6511         if (!dev->ethtool_ops)
6512                 dev->ethtool_ops = &default_ethtool_ops;
6513         return dev;
6514
6515 free_all:
6516         free_netdev(dev);
6517         return NULL;
6518
6519 free_pcpu:
6520         free_percpu(dev->pcpu_refcnt);
6521 free_dev:
6522         netdev_freemem(dev);
6523         return NULL;
6524 }
6525 EXPORT_SYMBOL(alloc_netdev_mqs);
6526
6527 /**
6528  *      free_netdev - free network device
6529  *      @dev: device
6530  *
6531  *      This function does the last stage of destroying an allocated device
6532  *      interface. The reference to the device object is released.
6533  *      If this is the last reference then it will be freed.
6534  */
6535 void free_netdev(struct net_device *dev)
6536 {
6537         struct napi_struct *p, *n;
6538
6539         release_net(dev_net(dev));
6540
6541         netif_free_tx_queues(dev);
6542 #ifdef CONFIG_SYSFS
6543         kfree(dev->_rx);
6544 #endif
6545
6546         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6547
6548         /* Flush device addresses */
6549         dev_addr_flush(dev);
6550
6551         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6552                 netif_napi_del(p);
6553
6554         free_percpu(dev->pcpu_refcnt);
6555         dev->pcpu_refcnt = NULL;
6556
6557         /*  Compatibility with error handling in drivers */
6558         if (dev->reg_state == NETREG_UNINITIALIZED) {
6559                 netdev_freemem(dev);
6560                 return;
6561         }
6562
6563         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6564         dev->reg_state = NETREG_RELEASED;
6565
6566         /* will free via device release */
6567         put_device(&dev->dev);
6568 }
6569 EXPORT_SYMBOL(free_netdev);
6570
6571 /**
6572  *      synchronize_net -  Synchronize with packet receive processing
6573  *
6574  *      Wait for packets currently being received to be done.
6575  *      Does not block later packets from starting.
6576  */
6577 void synchronize_net(void)
6578 {
6579         might_sleep();
6580         if (rtnl_is_locked())
6581                 synchronize_rcu_expedited();
6582         else
6583                 synchronize_rcu();
6584 }
6585 EXPORT_SYMBOL(synchronize_net);
6586
6587 /**
6588  *      unregister_netdevice_queue - remove device from the kernel
6589  *      @dev: device
6590  *      @head: list
6591  *
6592  *      This function shuts down a device interface and removes it
6593  *      from the kernel tables.
6594  *      If head not NULL, device is queued to be unregistered later.
6595  *
6596  *      Callers must hold the rtnl semaphore.  You may want
6597  *      unregister_netdev() instead of this.
6598  */
6599
6600 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6601 {
6602         ASSERT_RTNL();
6603
6604         if (head) {
6605                 list_move_tail(&dev->unreg_list, head);
6606         } else {
6607                 rollback_registered(dev);
6608                 /* Finish processing unregister after unlock */
6609                 net_set_todo(dev);
6610         }
6611 }
6612 EXPORT_SYMBOL(unregister_netdevice_queue);
6613
6614 /**
6615  *      unregister_netdevice_many - unregister many devices
6616  *      @head: list of devices
6617  */
6618 void unregister_netdevice_many(struct list_head *head)
6619 {
6620         struct net_device *dev;
6621
6622         if (!list_empty(head)) {
6623                 rollback_registered_many(head);
6624                 list_for_each_entry(dev, head, unreg_list)
6625                         net_set_todo(dev);
6626         }
6627 }
6628 EXPORT_SYMBOL(unregister_netdevice_many);
6629
6630 /**
6631  *      unregister_netdev - remove device from the kernel
6632  *      @dev: device
6633  *
6634  *      This function shuts down a device interface and removes it
6635  *      from the kernel tables.
6636  *
6637  *      This is just a wrapper for unregister_netdevice that takes
6638  *      the rtnl semaphore.  In general you want to use this and not
6639  *      unregister_netdevice.
6640  */
6641 void unregister_netdev(struct net_device *dev)
6642 {
6643         rtnl_lock();
6644         unregister_netdevice(dev);
6645         rtnl_unlock();
6646 }
6647 EXPORT_SYMBOL(unregister_netdev);
6648
6649 /**
6650  *      dev_change_net_namespace - move device to different nethost namespace
6651  *      @dev: device
6652  *      @net: network namespace
6653  *      @pat: If not NULL name pattern to try if the current device name
6654  *            is already taken in the destination network namespace.
6655  *
6656  *      This function shuts down a device interface and moves it
6657  *      to a new network namespace. On success 0 is returned, on
6658  *      a failure a netagive errno code is returned.
6659  *
6660  *      Callers must hold the rtnl semaphore.
6661  */
6662
6663 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6664 {
6665         int err;
6666
6667         ASSERT_RTNL();
6668
6669         /* Don't allow namespace local devices to be moved. */
6670         err = -EINVAL;
6671         if (dev->features & NETIF_F_NETNS_LOCAL)
6672                 goto out;
6673
6674         /* Ensure the device has been registrered */
6675         if (dev->reg_state != NETREG_REGISTERED)
6676                 goto out;
6677
6678         /* Get out if there is nothing todo */
6679         err = 0;
6680         if (net_eq(dev_net(dev), net))
6681                 goto out;
6682
6683         /* Pick the destination device name, and ensure
6684          * we can use it in the destination network namespace.
6685          */
6686         err = -EEXIST;
6687         if (__dev_get_by_name(net, dev->name)) {
6688                 /* We get here if we can't use the current device name */
6689                 if (!pat)
6690                         goto out;
6691                 if (dev_get_valid_name(net, dev, pat) < 0)
6692                         goto out;
6693         }
6694
6695         /*
6696          * And now a mini version of register_netdevice unregister_netdevice.
6697          */
6698
6699         /* If device is running close it first. */
6700         dev_close(dev);
6701
6702         /* And unlink it from device chain */
6703         err = -ENODEV;
6704         unlist_netdevice(dev);
6705
6706         synchronize_net();
6707
6708         /* Shutdown queueing discipline. */
6709         dev_shutdown(dev);
6710
6711         /* Notify protocols, that we are about to destroy
6712            this device. They should clean all the things.
6713
6714            Note that dev->reg_state stays at NETREG_REGISTERED.
6715            This is wanted because this way 8021q and macvlan know
6716            the device is just moving and can keep their slaves up.
6717         */
6718         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6719         rcu_barrier();
6720         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6721         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6722
6723         /*
6724          *      Flush the unicast and multicast chains
6725          */
6726         dev_uc_flush(dev);
6727         dev_mc_flush(dev);
6728
6729         /* Send a netdev-removed uevent to the old namespace */
6730         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6731
6732         /* Actually switch the network namespace */
6733         dev_net_set(dev, net);
6734
6735         /* If there is an ifindex conflict assign a new one */
6736         if (__dev_get_by_index(net, dev->ifindex)) {
6737                 int iflink = (dev->iflink == dev->ifindex);
6738                 dev->ifindex = dev_new_index(net);
6739                 if (iflink)
6740                         dev->iflink = dev->ifindex;
6741         }
6742
6743         /* Send a netdev-add uevent to the new namespace */
6744         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6745
6746         /* Fixup kobjects */
6747         err = device_rename(&dev->dev, dev->name);
6748         WARN_ON(err);
6749
6750         /* Add the device back in the hashes */
6751         list_netdevice(dev);
6752
6753         /* Notify protocols, that a new device appeared. */
6754         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6755
6756         /*
6757          *      Prevent userspace races by waiting until the network
6758          *      device is fully setup before sending notifications.
6759          */
6760         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6761
6762         synchronize_net();
6763         err = 0;
6764 out:
6765         return err;
6766 }
6767 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6768
6769 static int dev_cpu_callback(struct notifier_block *nfb,
6770                             unsigned long action,
6771                             void *ocpu)
6772 {
6773         struct sk_buff **list_skb;
6774         struct sk_buff *skb;
6775         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6776         struct softnet_data *sd, *oldsd;
6777
6778         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6779                 return NOTIFY_OK;
6780
6781         local_irq_disable();
6782         cpu = smp_processor_id();
6783         sd = &per_cpu(softnet_data, cpu);
6784         oldsd = &per_cpu(softnet_data, oldcpu);
6785
6786         /* Find end of our completion_queue. */
6787         list_skb = &sd->completion_queue;
6788         while (*list_skb)
6789                 list_skb = &(*list_skb)->next;
6790         /* Append completion queue from offline CPU. */
6791         *list_skb = oldsd->completion_queue;
6792         oldsd->completion_queue = NULL;
6793
6794         /* Append output queue from offline CPU. */
6795         if (oldsd->output_queue) {
6796                 *sd->output_queue_tailp = oldsd->output_queue;
6797                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6798                 oldsd->output_queue = NULL;
6799                 oldsd->output_queue_tailp = &oldsd->output_queue;
6800         }
6801         /* Append NAPI poll list from offline CPU. */
6802         if (!list_empty(&oldsd->poll_list)) {
6803                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6804                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6805         }
6806
6807         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6808         local_irq_enable();
6809
6810         /* Process offline CPU's input_pkt_queue */
6811         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6812                 netif_rx_internal(skb);
6813                 input_queue_head_incr(oldsd);
6814         }
6815         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6816                 netif_rx_internal(skb);
6817                 input_queue_head_incr(oldsd);
6818         }
6819
6820         return NOTIFY_OK;
6821 }
6822
6823
6824 /**
6825  *      netdev_increment_features - increment feature set by one
6826  *      @all: current feature set
6827  *      @one: new feature set
6828  *      @mask: mask feature set
6829  *
6830  *      Computes a new feature set after adding a device with feature set
6831  *      @one to the master device with current feature set @all.  Will not
6832  *      enable anything that is off in @mask. Returns the new feature set.
6833  */
6834 netdev_features_t netdev_increment_features(netdev_features_t all,
6835         netdev_features_t one, netdev_features_t mask)
6836 {
6837         if (mask & NETIF_F_GEN_CSUM)
6838                 mask |= NETIF_F_ALL_CSUM;
6839         mask |= NETIF_F_VLAN_CHALLENGED;
6840
6841         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6842         all &= one | ~NETIF_F_ALL_FOR_ALL;
6843
6844         /* If one device supports hw checksumming, set for all. */
6845         if (all & NETIF_F_GEN_CSUM)
6846                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6847
6848         return all;
6849 }
6850 EXPORT_SYMBOL(netdev_increment_features);
6851
6852 static struct hlist_head * __net_init netdev_create_hash(void)
6853 {
6854         int i;
6855         struct hlist_head *hash;
6856
6857         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6858         if (hash != NULL)
6859                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6860                         INIT_HLIST_HEAD(&hash[i]);
6861
6862         return hash;
6863 }
6864
6865 /* Initialize per network namespace state */
6866 static int __net_init netdev_init(struct net *net)
6867 {
6868         if (net != &init_net)
6869                 INIT_LIST_HEAD(&net->dev_base_head);
6870
6871         net->dev_name_head = netdev_create_hash();
6872         if (net->dev_name_head == NULL)
6873                 goto err_name;
6874
6875         net->dev_index_head = netdev_create_hash();
6876         if (net->dev_index_head == NULL)
6877                 goto err_idx;
6878
6879         return 0;
6880
6881 err_idx:
6882         kfree(net->dev_name_head);
6883 err_name:
6884         return -ENOMEM;
6885 }
6886
6887 /**
6888  *      netdev_drivername - network driver for the device
6889  *      @dev: network device
6890  *
6891  *      Determine network driver for device.
6892  */
6893 const char *netdev_drivername(const struct net_device *dev)
6894 {
6895         const struct device_driver *driver;
6896         const struct device *parent;
6897         const char *empty = "";
6898
6899         parent = dev->dev.parent;
6900         if (!parent)
6901                 return empty;
6902
6903         driver = parent->driver;
6904         if (driver && driver->name)
6905                 return driver->name;
6906         return empty;
6907 }
6908
6909 static int __netdev_printk(const char *level, const struct net_device *dev,
6910                            struct va_format *vaf)
6911 {
6912         int r;
6913
6914         if (dev && dev->dev.parent) {
6915                 r = dev_printk_emit(level[1] - '0',
6916                                     dev->dev.parent,
6917                                     "%s %s %s: %pV",
6918                                     dev_driver_string(dev->dev.parent),
6919                                     dev_name(dev->dev.parent),
6920                                     netdev_name(dev), vaf);
6921         } else if (dev) {
6922                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6923         } else {
6924                 r = printk("%s(NULL net_device): %pV", level, vaf);
6925         }
6926
6927         return r;
6928 }
6929
6930 int netdev_printk(const char *level, const struct net_device *dev,
6931                   const char *format, ...)
6932 {
6933         struct va_format vaf;
6934         va_list args;
6935         int r;
6936
6937         va_start(args, format);
6938
6939         vaf.fmt = format;
6940         vaf.va = &args;
6941
6942         r = __netdev_printk(level, dev, &vaf);
6943
6944         va_end(args);
6945
6946         return r;
6947 }
6948 EXPORT_SYMBOL(netdev_printk);
6949
6950 #define define_netdev_printk_level(func, level)                 \
6951 int func(const struct net_device *dev, const char *fmt, ...)    \
6952 {                                                               \
6953         int r;                                                  \
6954         struct va_format vaf;                                   \
6955         va_list args;                                           \
6956                                                                 \
6957         va_start(args, fmt);                                    \
6958                                                                 \
6959         vaf.fmt = fmt;                                          \
6960         vaf.va = &args;                                         \
6961                                                                 \
6962         r = __netdev_printk(level, dev, &vaf);                  \
6963                                                                 \
6964         va_end(args);                                           \
6965                                                                 \
6966         return r;                                               \
6967 }                                                               \
6968 EXPORT_SYMBOL(func);
6969
6970 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6971 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6972 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6973 define_netdev_printk_level(netdev_err, KERN_ERR);
6974 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6975 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6976 define_netdev_printk_level(netdev_info, KERN_INFO);
6977
6978 static void __net_exit netdev_exit(struct net *net)
6979 {
6980         kfree(net->dev_name_head);
6981         kfree(net->dev_index_head);
6982 }
6983
6984 static struct pernet_operations __net_initdata netdev_net_ops = {
6985         .init = netdev_init,
6986         .exit = netdev_exit,
6987 };
6988
6989 static void __net_exit default_device_exit(struct net *net)
6990 {
6991         struct net_device *dev, *aux;
6992         /*
6993          * Push all migratable network devices back to the
6994          * initial network namespace
6995          */
6996         rtnl_lock();
6997         for_each_netdev_safe(net, dev, aux) {
6998                 int err;
6999                 char fb_name[IFNAMSIZ];
7000
7001                 /* Ignore unmoveable devices (i.e. loopback) */
7002                 if (dev->features & NETIF_F_NETNS_LOCAL)
7003                         continue;
7004
7005                 /* Leave virtual devices for the generic cleanup */
7006                 if (dev->rtnl_link_ops)
7007                         continue;
7008
7009                 /* Push remaining network devices to init_net */
7010                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7011                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7012                 if (err) {
7013                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7014                                  __func__, dev->name, err);
7015                         BUG();
7016                 }
7017         }
7018         rtnl_unlock();
7019 }
7020
7021 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7022 {
7023         /* Return with the rtnl_lock held when there are no network
7024          * devices unregistering in any network namespace in net_list.
7025          */
7026         struct net *net;
7027         bool unregistering;
7028         DEFINE_WAIT(wait);
7029
7030         for (;;) {
7031                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7032                                 TASK_UNINTERRUPTIBLE);
7033                 unregistering = false;
7034                 rtnl_lock();
7035                 list_for_each_entry(net, net_list, exit_list) {
7036                         if (net->dev_unreg_count > 0) {
7037                                 unregistering = true;
7038                                 break;
7039                         }
7040                 }
7041                 if (!unregistering)
7042                         break;
7043                 __rtnl_unlock();
7044                 schedule();
7045         }
7046         finish_wait(&netdev_unregistering_wq, &wait);
7047 }
7048
7049 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7050 {
7051         /* At exit all network devices most be removed from a network
7052          * namespace.  Do this in the reverse order of registration.
7053          * Do this across as many network namespaces as possible to
7054          * improve batching efficiency.
7055          */
7056         struct net_device *dev;
7057         struct net *net;
7058         LIST_HEAD(dev_kill_list);
7059
7060         /* To prevent network device cleanup code from dereferencing
7061          * loopback devices or network devices that have been freed
7062          * wait here for all pending unregistrations to complete,
7063          * before unregistring the loopback device and allowing the
7064          * network namespace be freed.
7065          *
7066          * The netdev todo list containing all network devices
7067          * unregistrations that happen in default_device_exit_batch
7068          * will run in the rtnl_unlock() at the end of
7069          * default_device_exit_batch.
7070          */
7071         rtnl_lock_unregistering(net_list);
7072         list_for_each_entry(net, net_list, exit_list) {
7073                 for_each_netdev_reverse(net, dev) {
7074                         if (dev->rtnl_link_ops)
7075                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7076                         else
7077                                 unregister_netdevice_queue(dev, &dev_kill_list);
7078                 }
7079         }
7080         unregister_netdevice_many(&dev_kill_list);
7081         list_del(&dev_kill_list);
7082         rtnl_unlock();
7083 }
7084
7085 static struct pernet_operations __net_initdata default_device_ops = {
7086         .exit = default_device_exit,
7087         .exit_batch = default_device_exit_batch,
7088 };
7089
7090 /*
7091  *      Initialize the DEV module. At boot time this walks the device list and
7092  *      unhooks any devices that fail to initialise (normally hardware not
7093  *      present) and leaves us with a valid list of present and active devices.
7094  *
7095  */
7096
7097 /*
7098  *       This is called single threaded during boot, so no need
7099  *       to take the rtnl semaphore.
7100  */
7101 static int __init net_dev_init(void)
7102 {
7103         int i, rc = -ENOMEM;
7104
7105         BUG_ON(!dev_boot_phase);
7106
7107         if (dev_proc_init())
7108                 goto out;
7109
7110         if (netdev_kobject_init())
7111                 goto out;
7112
7113         INIT_LIST_HEAD(&ptype_all);
7114         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7115                 INIT_LIST_HEAD(&ptype_base[i]);
7116
7117         INIT_LIST_HEAD(&offload_base);
7118
7119         if (register_pernet_subsys(&netdev_net_ops))
7120                 goto out;
7121
7122         /*
7123          *      Initialise the packet receive queues.
7124          */
7125
7126         for_each_possible_cpu(i) {
7127                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7128
7129                 skb_queue_head_init(&sd->input_pkt_queue);
7130                 skb_queue_head_init(&sd->process_queue);
7131                 INIT_LIST_HEAD(&sd->poll_list);
7132                 sd->output_queue_tailp = &sd->output_queue;
7133 #ifdef CONFIG_RPS
7134                 sd->csd.func = rps_trigger_softirq;
7135                 sd->csd.info = sd;
7136                 sd->cpu = i;
7137 #endif
7138
7139                 sd->backlog.poll = process_backlog;
7140                 sd->backlog.weight = weight_p;
7141         }
7142
7143         dev_boot_phase = 0;
7144
7145         /* The loopback device is special if any other network devices
7146          * is present in a network namespace the loopback device must
7147          * be present. Since we now dynamically allocate and free the
7148          * loopback device ensure this invariant is maintained by
7149          * keeping the loopback device as the first device on the
7150          * list of network devices.  Ensuring the loopback devices
7151          * is the first device that appears and the last network device
7152          * that disappears.
7153          */
7154         if (register_pernet_device(&loopback_net_ops))
7155                 goto out;
7156
7157         if (register_pernet_device(&default_device_ops))
7158                 goto out;
7159
7160         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7161         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7162
7163         hotcpu_notifier(dev_cpu_callback, 0);
7164         dst_init();
7165         rc = 0;
7166 out:
7167         return rc;
7168 }
7169
7170 subsys_initcall(net_dev_init);