/*
 *      NET3    Protocol independent device support routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 *      Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *      Additional Authors:
 *              Florian la Roche <rzsfl@rz.uni-sb.de>
 *              Alan Cox <gw4pts@gw4pts.ampr.org>
 *              David Hinds <dahinds@users.sourceforge.net>
 *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *              Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *      Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *              Alan Cox        :       device private ioctl copies fields back.
 *              Alan Cox        :       Transmit queue code does relevant
 *                                      stunts to keep the queue safe.
 *              Alan Cox        :       Fixed double lock.
 *              Alan Cox        :       Fixed promisc NULL pointer trap
 *              ????????        :       Support the full private ioctl range
 *              Alan Cox        :       Moved ioctl permission check into
 *                                      drivers
 *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
 *              Alan Cox        :       100 backlog just doesn't cut it when
 *                                      you start doing multicast video 8)
 *              Alan Cox        :       Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *              Alan Cox        :       Took out transmit every packet pass
 *                                      Saved a few bytes in the ioctl handler
 *              Alan Cox        :       Network driver sets packet type before
 *                                      calling netif_rx. Saves a function
 *                                      call a packet.
 *              Alan Cox        :       Hashed net_bh()
 *              Richard Kooijman:       Timestamp fixes.
 *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
 *              Alan Cox        :       Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                      changes.
 *              Rudi Cilibrasi  :       Pass the right thing to
 *                                      set_mac_address()
 *              Dave Miller     :       32bit quantity for the device lock to
 *                                      make it work out on a Sparc.
 *              Bjorn Ekwall    :       Added KERNELD hack.
 *              Alan Cox        :       Cleaned up the backlog initialise.
 *              Craig Metz      :       SIOCGIFCONF fix if space for under
 *                                      1 device.
 *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
 *                                      is no device open function.
 *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
 *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
 *              Cyrus Durgin    :       Cleaned for KMOD
 *              Adam Sulmicki   :       Bug Fix : Network Device Unload
 *                                      A network device unload needs to purge
 *                                      the backlog queue.
 *      Paul Rusty Russell      :       SIOCSIFNAME
 *              Pekka Riikonen  :       Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                      - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;       /* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
                                         struct net_device *dev,
                                         struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
        while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

        return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        write_lock_bh(&dev_base_lock);
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        /* Unlink dev from the device chain */
        write_lock_bh(&dev_base_lock);
        list_del_rcu(&dev->dev_list);
        hlist_del_rcu(&dev->name_hlist);
        hlist_del_rcu(&dev->index_hlist);
        write_unlock_bh(&dev_base_lock);

        dev_base_seq_inc(dev_net(dev));
}

/*
 *      Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
                if (netdev_lock_type[i] == dev_type)
                        return i;
        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int i;

        i = netdev_lock_pos(dev_type);
        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
                                   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int i;

        i = netdev_lock_pos(dev->type);
        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[i],
                                   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

                Protocol management and registration routines

*******************************************************************************/

/*
 *      Add a protocol ID to the list. Now that the input handler is
 *      smarter we can dispense with all the messy stuff that used to be
 *      here.
 *
 *      BEWARE!!! Protocol handlers that mangle input packets
 *      MUST BE last in the hash buckets, and protocol handlers that only
 *      inspect packets MUST start from the promiscuous ptype_all chain
 *      in net_bh. This holds today; do not change it.
 *      Explanation: if a handler that mangles packets were first on the
 *      list, it could not tell that the packet is cloned and should be
 *      copied-on-write, so it would modify it in place and subsequent
 *      readers would see a corrupted packet.
 *                                                      --ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type == htons(ETH_P_ALL))
                return &ptype_all;
        else
                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *      dev_add_pack - add packet handler
 *      @pt: packet type declaration
 *
 *      Add a protocol handler to the networking stack. The passed &packet_type
 *      is linked into kernel lists and may not be freed until it has been
 *      removed from the kernel lists.
 *
 *      This call does not sleep and therefore cannot guarantee
 *      that all CPUs that are in the middle of receiving packets
 *      will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);

        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *      __dev_remove_pack        - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all CPUs have gone
 *      through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;

        spin_lock(&ptype_lock);

        list_for_each_entry(pt1, head, list) {
                if (pt == pt1) {
                        list_del_rcu(&pt->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
out:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *      dev_remove_pack  - remove packet handler
 *      @pt: packet type declaration
 *
 *      Remove a protocol handler that was previously added to the kernel
 *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *      from the kernel lists and can be freed or reused once this function
 *      returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

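/*
 * Illustrative sketch (not part of dev.c): how a protocol module might
 * register and unregister a handler with dev_add_pack()/dev_remove_pack().
 * The names my_proto_rcv and MY_PROTO_ETHERTYPE are hypothetical; the
 * block is guarded with #if 0 so it is documentation only.
 */
#if 0
static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
                        struct packet_type *pt, struct net_device *orig_dev)
{
        /* Consume or hand off the packet; the handler owns the skb. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type my_proto_pt __read_mostly = {
        .type = cpu_to_be16(MY_PROTO_ETHERTYPE),        /* hypothetical ethertype */
        .func = my_proto_rcv,
};

static int __init my_proto_init(void)
{
        dev_add_pack(&my_proto_pt);     /* becomes visible to new packets via RCU */
        return 0;
}

static void __exit my_proto_exit(void)
{
        dev_remove_pack(&my_proto_pt);  /* sleeps until no CPU can still see it */
}
#endif
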

/**
 *      dev_add_offload - register offload handlers
 *      @po: protocol offload declaration
 *
 *      Add protocol offload handlers to the networking stack. The passed
 *      &proto_offload is linked into kernel lists and may not be freed until
 *      it has been removed from the kernel lists.
 *
 *      This call does not sleep and therefore cannot guarantee
 *      that all CPUs that are in the middle of receiving packets
 *      will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;

        spin_lock(&offload_lock);
        list_add_rcu(&po->list, head);
        spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *      __dev_remove_offload     - remove offload handler
 *      @po: packet offload declaration
 *
 *      Remove a protocol offload handler that was previously added to the
 *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 *      is removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      The packet type might still be in use by receivers
 *      and must not be freed until after all CPUs have gone
 *      through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
        struct list_head *head = &offload_base;
        struct packet_offload *po1;

        spin_lock(&offload_lock);

        list_for_each_entry(po1, head, list) {
                if (po == po1) {
                        list_del_rcu(&po->list);
                        goto out;
                }
        }

        pr_warn("dev_remove_offload: %p not found\n", po);
out:
        spin_unlock(&offload_lock);
}

/**
 *      dev_remove_offload       - remove packet offload handler
 *      @po: packet offload declaration
 *
 *      Remove a packet offload handler that was previously added to the kernel
 *      offload handlers by dev_add_offload(). The passed &offload_type is
 *      removed from the kernel lists and can be freed or reused once this
 *      function returns.
 *
 *      This call sleeps to guarantee that no CPU is looking at the packet
 *      type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
        __dev_remove_offload(po);

        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

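/*
 * Illustrative sketch (not part of dev.c): a protocol that provides GRO/GSO
 * support registers a struct packet_offload at init time.  The callback
 * functions are omitted here; their prototypes are whatever struct
 * offload_callbacks defines in this kernel.  MY_PROTO_ETHERTYPE is
 * hypothetical, and the block is guarded with #if 0.
 */
#if 0
static struct packet_offload my_proto_offload __read_mostly = {
        .type = cpu_to_be16(MY_PROTO_ETHERTYPE),
        /* .callbacks = { .gro_receive = ..., .gro_complete = ..., }, */
};

static int __init my_proto_offload_init(void)
{
        dev_add_offload(&my_proto_offload);
        return 0;
}
#endif
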
/******************************************************************************

                      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *      netdev_boot_setup_add   - add new setup entry
 *      @name: name of the device
 *      @map: configured settings for the device
 *
 *      Adds a new setup entry to the dev_boot_setup list.  The function
 *      returns 0 on error and 1 on success.  This is a generic routine for
 *      all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
        struct netdev_boot_setup *s;
        int i;

        s = dev_boot_setup;
        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
                        memset(s[i].name, 0, sizeof(s[i].name));
                        strlcpy(s[i].name, name, IFNAMSIZ);
                        memcpy(&s[i].map, map, sizeof(s[i].map));
                        break;
                }
        }

        return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *      netdev_boot_setup_check - check boot time settings
 *      @dev: the netdevice
 *
 *      Check boot time settings for the device.
 *      Any settings found are applied to the device so they can be
 *      used later during device probing.
 *      Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
        struct netdev_boot_setup *s = dev_boot_setup;
        int i;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
                if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
                    !strcmp(dev->name, s[i].name)) {
                        dev->irq        = s[i].map.irq;
                        dev->base_addr  = s[i].map.base_addr;
                        dev->mem_start  = s[i].map.mem_start;
                        dev->mem_end    = s[i].map.mem_end;
                        return 1;
                }
        }
        return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *      netdev_boot_base        - get address from boot time settings
 *      @prefix: prefix for network device
 *      @unit: id for network device
 *
 *      Check boot time settings for the base address of device.
 *      The found settings are set for the device to be used
 *      later in the device probing.
 *      Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
        const struct netdev_boot_setup *s = dev_boot_setup;
        char name[IFNAMSIZ];
        int i;

        sprintf(name, "%s%d", prefix, unit);

        /*
         * If device already registered then return base of 1
         * to indicate not to probe for this interface
         */
        if (__dev_get_by_name(&init_net, name))
                return 1;

        for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
                if (!strcmp(name, s[i].name))
                        return s[i].map.base_addr;
        return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
        int ints[5];
        struct ifmap map;

        str = get_options(str, ARRAY_SIZE(ints), ints);
        if (!str || !*str)
                return 0;

        /* Save settings */
        memset(&map, 0, sizeof(map));
        if (ints[0] > 0)
                map.irq = ints[1];
        if (ints[0] > 1)
                map.base_addr = ints[2];
        if (ints[0] > 2)
                map.mem_start = ints[3];
        if (ints[0] > 3)
                map.mem_end = ints[4];

        /* Add new entry to the list */
        return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

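/*
 * Example (following the parsing above; the exact values are illustrative):
 * booting with
 *
 *      netdev=5,0x340,0,0,eth1
 *
 * stores irq=5 and base_addr=0x340 for the device that will be named eth1,
 * to be picked up later by netdev_boot_setup_check() during probing.
 */
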
/*******************************************************************************

                            Device Interface Subroutines

*******************************************************************************/

/**
 *      __dev_get_by_name       - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. Must be called under RTNL semaphore
 *      or @dev_base_lock. If the name is found a pointer to the device
 *      is returned. If the name is not found then %NULL is returned. The
 *      reference counters are not incremented so the caller must be
 *      careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *      dev_get_by_name_rcu     - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name.
 *      If the name is found a pointer to the device is returned.
 *      If the name is not found then %NULL is returned.
 *      The reference counters are not incremented so the caller must be
 *      careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct net_device *dev;
        struct hlist_head *head = dev_name_hash(net, name);

        hlist_for_each_entry_rcu(dev, head, name_hlist)
                if (!strncmp(dev->name, name, IFNAMSIZ))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *      dev_get_by_name         - find a device by its name
 *      @net: the applicable net namespace
 *      @name: name to find
 *
 *      Find an interface by name. This can be called from any
 *      context and does its own locking. The returned handle has
 *      the usage count incremented and the caller must use dev_put() to
 *      release it when it is no longer needed. %NULL is returned if no
 *      matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_name_rcu(net, name);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

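/*
 * Illustrative sketch (not part of dev.c): looking up a device by name from
 * process context.  The reference taken by dev_get_by_name() must be
 * released with dev_put().  "eth0" and my_example_lookup are hypothetical.
 */
#if 0
static int my_example_lookup(struct net *net)
{
        struct net_device *dev;

        dev = dev_get_by_name(net, "eth0");
        if (!dev)
                return -ENODEV;

        pr_info("found %s, ifindex %d\n", dev->name, dev->ifindex);
        dev_put(dev);           /* drop the reference we were given */
        return 0;
}
#endif
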
/**
 *      __dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns a pointer to the device,
 *      or %NULL if it is not found. The device's reference counter is not
 *      incremented, so the caller must be careful about locking. The caller
 *      must hold either the RTNL semaphore or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *      dev_get_by_index_rcu - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns a pointer to the device,
 *      or %NULL if it is not found. The device's reference counter is not
 *      incremented, so the caller must be careful about locking. The caller
 *      must hold the RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *      dev_get_by_index - find a device by its ifindex
 *      @net: the applicable net namespace
 *      @ifindex: index of device
 *
 *      Search for an interface by index. Returns a pointer to the device,
 *      or NULL if it is not found. The returned device has had a reference
 *      added and the pointer is safe until the caller releases it with
 *      dev_put().
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                dev_hold(dev);
        rcu_read_unlock();
        return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

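/*
 * Illustrative sketch (not part of dev.c): an RCU lookup by ifindex.  No
 * reference is taken, so the device may only be used inside the RCU
 * read-side critical section (or dev_hold() must be called before leaving
 * it).  my_example_ifindex_is_up is hypothetical.
 */
#if 0
static bool my_example_ifindex_is_up(struct net *net, int ifindex)
{
        struct net_device *dev;
        bool up = false;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                up = !!(dev->flags & IFF_UP);
        rcu_read_unlock();

        return up;
}
#endif
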
/**
 *      netdev_get_name - get a netdevice name, knowing its ifindex.
 *      @net: network namespace
 *      @name: a pointer to the buffer where the name will be stored.
 *      @ifindex: the ifindex of the interface to get the name from.
 *
 *      The use of raw_seqcount_begin() and cond_resched() before
 *      retrying is required as we want to give the writers a chance
 *      to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        unsigned int seq;

retry:
        seq = raw_seqcount_begin(&devnet_rename_seq);
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (!dev) {
                rcu_read_unlock();
                return -ENODEV;
        }

        strcpy(name, dev->name);
        rcu_read_unlock();
        if (read_seqcount_retry(&devnet_rename_seq, seq)) {
                cond_resched();
                goto retry;
        }

        return 0;
}

/**
 *      dev_getbyhwaddr_rcu - find a device by its hardware address
 *      @net: the applicable net namespace
 *      @type: media type of device
 *      @ha: hardware address
 *
 *      Search for an interface by MAC address. Returns a pointer to the
 *      device, or NULL if it is not found.
 *      The caller must hold RCU or RTNL.
 *      The returned device has not had its reference count increased,
 *      so the caller must be careful about locking.
 */


struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev->type == type &&
                    !memcmp(dev->dev_addr, ha, dev->addr_len))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev->type == type)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *      __dev_get_by_flags - find any device with given flags
 *      @net: the applicable net namespace
 *      @if_flags: IFF_* values
 *      @mask: bitmask of bits in if_flags to check
 *
 *      Search for any interface with the given flags. Returns a pointer to
 *      the first matching device, or NULL if none is found. Must be called
 *      inside rtnl_lock(), and the result's refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
                                      unsigned short mask)
{
        struct net_device *dev, *ret;

        ASSERT_RTNL();

        ret = NULL;
        for_each_netdev(net, dev) {
                if (((dev->flags ^ if_flags) & mask) == 0) {
                        ret = dev;
                        break;
                }
        }
        return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *      dev_valid_name - check if name is okay for network device
 *      @name: name string
 *
 *      Network device names need to be valid file names to
 *      allow sysfs to work.  We also disallow any kind of
 *      whitespace.
 */
bool dev_valid_name(const char *name)
{
        if (*name == '\0')
                return false;
        if (strlen(name) >= IFNAMSIZ)
                return false;
        if (!strcmp(name, ".") || !strcmp(name, ".."))
                return false;

        while (*name) {
                if (*name == '/' || isspace(*name))
                        return false;
                name++;
        }
        return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *      __dev_alloc_name - allocate a name for a device
 *      @net: network namespace to allocate the device name in
 *      @name: name format string
 *      @buf:  scratch buffer and result name string
 *
 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 *      id. It scans the list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
        int i = 0;
        const char *p;
        const int max_netdevices = 8*PAGE_SIZE;
        unsigned long *inuse;
        struct net_device *d;

        p = strnchr(name, IFNAMSIZ-1, '%');
        if (p) {
                /*
                 * Verify the string as this thing may have come from
                 * the user.  There must be exactly one "%d" and no other "%"
                 * characters.
                 */
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                /* Use one page as a bit array of possible slots */
                inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
                if (!inuse)
                        return -ENOMEM;

                for_each_netdev(net, d) {
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
                                continue;

                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
                                set_bit(i, inuse);
                }

                i = find_first_zero_bit(inuse, max_netdevices);
                free_page((unsigned long) inuse);
        }

        if (buf != name)
                snprintf(buf, IFNAMSIZ, name, i);
        if (!__dev_get_by_name(net, buf))
                return i;

        /* It is possible to run out of possible slots
         * when the name is long and there isn't enough space left
         * for the digits, or if all bits are used.
         */
        return -ENFILE;
}

/**
 *      dev_alloc_name - allocate a name for a device
 *      @dev: device
 *      @name: name format string
 *
 *      Passed a format string - eg "lt%d" - it will try to find a suitable
 *      id. It scans the list of devices to build up a free map, then chooses
 *      the first empty slot. The caller must hold the dev_base or rtnl lock
 *      while allocating the name and adding the device in order to avoid
 *      duplicates.
 *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *      Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        char buf[IFNAMSIZ];
        struct net *net;
        int ret;

        BUG_ON(!dev_net(dev));
        net = dev_net(dev);
        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

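/*
 * Illustrative sketch (not part of dev.c): a driver that wants an
 * automatically numbered name asks for it with a "%d" format string before
 * registering the device.  The "mydev%d" name and my_example_name_device
 * are hypothetical; error handling is abbreviated.
 */
#if 0
static int my_example_name_device(struct net_device *dev)
{
        int err;

        err = dev_alloc_name(dev, "mydev%d");   /* e.g. becomes "mydev0" */
        if (err < 0)
                return err;

        return register_netdevice(dev);         /* caller holds RTNL */
}
#endif
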
static int dev_alloc_name_ns(struct net *net,
                             struct net_device *dev,
                             const char *name)
{
        char buf[IFNAMSIZ];
        int ret;

        ret = __dev_alloc_name(net, name, buf);
        if (ret >= 0)
                strlcpy(dev->name, buf, IFNAMSIZ);
        return ret;
}

static int dev_get_valid_name(struct net *net,
                              struct net_device *dev,
                              const char *name)
{
        BUG_ON(!net);

        if (!dev_valid_name(name))
                return -EINVAL;

        if (strchr(name, '%'))
                return dev_alloc_name_ns(net, dev, name);
        else if (__dev_get_by_name(net, name))
                return -EEXIST;
        else if (dev->name != name)
                strlcpy(dev->name, name, IFNAMSIZ);

        return 0;
}

/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: name (or format string) must be at least IFNAMSIZ
 *
 *      Change the name of a device. A format string such as "eth%d"
 *      can be passed for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        write_seqcount_begin(&devnet_rename_seq);

        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        memcpy(oldname, dev->name, IFNAMSIZ);

        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}

/**
 *      dev_set_alias - change ifalias of a device
 *      @dev: device
 *      @alias: name up to IFALIASZ
 *      @len: limit of bytes to copy from info
 *
 *      Set the ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        char *new_ifalias;

        ASSERT_RTNL();

        if (len >= IFALIASZ)
                return -EINVAL;

        if (!len) {
                kfree(dev->ifalias);
                dev->ifalias = NULL;
                return 0;
        }

        new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
        if (!new_ifalias)
                return -ENOMEM;
        dev->ifalias = new_ifalias;

        strlcpy(dev->ifalias, alias, len+1);
        return len;
}


/**
 *      netdev_features_change - device changes features
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *      netdev_state_change - device changes state
 *      @dev: device to cause notification
 *
 *      Called to indicate a device has changed state. This function calls
 *      the notifier chains for netdev_chain and sends a NEWLINK message
 *      to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                struct netdev_notifier_change_info change_info;

                change_info.flags_changed = 0;
                call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
                                              &change_info.info);
                rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
        }
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *      netdev_notify_peers - notify network peers about existence of @dev
 *      @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
        rtnl_lock();
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();

        if (!netif_device_present(dev))
                return -ENODEV;

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        set_bit(__LINK_STATE_START, &dev->state);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                dev->flags |= IFF_UP;
                net_dmaengine_get();
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}

/**
 *      dev_open        - prepare an interface for use.
 *      @dev:   device to open
 *
 *      Takes a device from down to up state. The device's private open
 *      function is invoked and then the multicast lists are loaded. Finally
 *      the device is moved into the up state and a %NETDEV_UP message is
 *      sent to the netdev notifier chain.
 *
 *      Calling this function on an active interface is a nop. On a failure
 *      a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev);
        if (ret < 0)
                return ret;

        rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
        call_netdevice_notifiers(NETDEV_UP, dev);

        return ret;
}
EXPORT_SYMBOL(dev_open);

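/*
 * Illustrative sketch (not part of dev.c): bringing an interface up from
 * kernel code.  dev_open() must be called with the RTNL lock held;
 * my_example_bring_up is hypothetical.
 */
#if 0
static int my_example_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev);            /* no-op if the device is already up */
        rtnl_unlock();

        return err;
}
#endif
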
static int __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can even be on a different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *      Call the device specific close. This cannot fail.
                 *      Only if device is UP
                 *
                 *      We allow it to be called even after a DETACH hot-plug
                 *      event.
                 */
                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                dev->flags &= ~IFF_UP;
                net_dmaengine_put();
                netpoll_poll_enable(dev);
        }

        return 0;
}

static int __dev_close(struct net_device *dev)
{
        int retval;
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        retval = __dev_close_many(&single);
        list_del(&single);

        return retval;
}

static int dev_close_many(struct list_head *head)
{
        struct net_device *dev, *tmp;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, tmp, head, close_list)
                if (!(dev->flags & IFF_UP))
                        list_del_init(&dev->close_list);

        __dev_close_many(head);

        list_for_each_entry_safe(dev, tmp, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);
                list_del_init(&dev->close_list);
        }

        return 0;
}

/**
 *      dev_close - shutdown an interface.
 *      @dev: device to shutdown
 *
 *      This function moves an active device into down state. A
 *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *      chain.
 */
int dev_close(struct net_device *dev)
{
        if (dev->flags & IFF_UP) {
                LIST_HEAD(single);

                list_add(&dev->close_list, &single);
                dev_close_many(&single);
                list_del(&single);
        }
        return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *      dev_disable_lro - disable Large Receive Offload on a device
 *      @dev: device
 *
 *      Disable Large Receive Offload (LRO) on a net device.  Must be
 *      called under RTNL.  This is needed if received packets may be
 *      forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
        /*
         * If we're trying to disable lro on a vlan device
         * use the underlying physical device instead
         */
        if (is_vlan_dev(dev))
                dev = vlan_dev_real_dev(dev);

        /* the same for macvlan devices */
        if (netif_is_macvlan(dev))
                dev = macvlan_dev_real_dev(dev);

        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info;

        netdev_notifier_info_init(&info, dev);
        return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered, all registration and up events are replayed
 *      to the new notifier so that it has a race-free view of the
 *      network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        if (dev_boot_phase)
                goto unlock;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

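/*
 * Illustrative sketch (not part of dev.c): a subsystem that wants to track
 * device state registers a notifier block.  netdev_notifier_info_to_dev()
 * extracts the net_device from the opaque pointer.  The my_example_* names
 * are hypothetical.
 */
#if 0
static int my_example_netdev_event(struct notifier_block *nb,
                                   unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UP:
                pr_info("%s is up\n", dev->name);
                break;
        case NETDEV_DOWN:
                pr_info("%s is down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block my_example_netdev_nb = {
        .notifier_call = my_example_netdev_event,
};

/* At init time: register_netdevice_notifier(&my_example_netdev_nb); */
#endif
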
1537 /**
1538  *      unregister_netdevice_notifier - unregister a network notifier block
1539  *      @nb: notifier
1540  *
1541  *      Unregister a notifier previously registered by
1542  *      register_netdevice_notifier(). The notifier is unlinked from the
1543  *      kernel structures and may then be reused. A negative errno code
1544  *      is returned on a failure.
1545  *
1546  *      After unregistering, unregister and down device events are synthesized
1547  *      for all devices on the device list and delivered to the removed
1548  *      notifier, removing the need for special-case cleanup code.
1549  */
1550
1551 int unregister_netdevice_notifier(struct notifier_block *nb)
1552 {
1553         struct net_device *dev;
1554         struct net *net;
1555         int err;
1556
1557         rtnl_lock();
1558         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1559         if (err)
1560                 goto unlock;
1561
1562         for_each_net(net) {
1563                 for_each_netdev(net, dev) {
1564                         if (dev->flags & IFF_UP) {
1565                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1566                                                         dev);
1567                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1568                         }
1569                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1570                 }
1571         }
1572 unlock:
1573         rtnl_unlock();
1574         return err;
1575 }
1576 EXPORT_SYMBOL(unregister_netdevice_notifier);
1577
1578 /**
1579  *      call_netdevice_notifiers_info - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *      @info: notifier information data
1583  *
1584  *      Call all network notifier blocks.  Parameters and return value
1585  *      are as for raw_notifier_call_chain().
1586  */
1587
1588 static int call_netdevice_notifiers_info(unsigned long val,
1589                                          struct net_device *dev,
1590                                          struct netdev_notifier_info *info)
1591 {
1592         ASSERT_RTNL();
1593         netdev_notifier_info_init(info, dev);
1594         return raw_notifier_call_chain(&netdev_chain, val, info);
1595 }
1596
1597 /**
1598  *      call_netdevice_notifiers - call all network notifier blocks
1599  *      @val: value passed unmodified to notifier function
1600  *      @dev: net_device pointer passed unmodified to notifier function
1601  *
1602  *      Call all network notifier blocks.  Parameters and return value
1603  *      are as for raw_notifier_call_chain().
1604  */
1605
1606 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1607 {
1608         struct netdev_notifier_info info;
1609
1610         return call_netdevice_notifiers_info(val, dev, &info);
1611 }
1612 EXPORT_SYMBOL(call_netdevice_notifiers);
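/*
 * Typical call site, sketched: after changing a device attribute under RTNL,
 * the core tells every registered notifier about it, roughly as dev_set_mtu()
 * does later in this file:
 *
 *	dev->mtu = new_mtu;
 *	call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
 */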
1613
1614 static struct static_key netstamp_needed __read_mostly;
1615 #ifdef HAVE_JUMP_LABEL
1616 /* We are not allowed to call static_key_slow_dec() from irq context.
1617  * If net_disable_timestamp() is called from irq context, defer the
1618  * static_key_slow_dec() calls.
1619  */
1620 static atomic_t netstamp_needed_deferred;
1621 #endif
1622
1623 void net_enable_timestamp(void)
1624 {
1625 #ifdef HAVE_JUMP_LABEL
1626         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1627
1628         if (deferred) {
1629                 while (--deferred)
1630                         static_key_slow_dec(&netstamp_needed);
1631                 return;
1632         }
1633 #endif
1634         static_key_slow_inc(&netstamp_needed);
1635 }
1636 EXPORT_SYMBOL(net_enable_timestamp);
1637
1638 void net_disable_timestamp(void)
1639 {
1640 #ifdef HAVE_JUMP_LABEL
1641         if (in_interrupt()) {
1642                 atomic_inc(&netstamp_needed_deferred);
1643                 return;
1644         }
1645 #endif
1646         static_key_slow_dec(&netstamp_needed);
1647 }
1648 EXPORT_SYMBOL(net_disable_timestamp);
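/*
 * net_enable_timestamp() and net_disable_timestamp() must stay balanced:
 * every successful enable needs a matching disable.  A hedged sketch of the
 * usual pattern (the real callers sit in the socket timestamping code):
 *
 *	net_enable_timestamp();
 *	... the receive path now stamps skbs via net_timestamp_set() ...
 *	net_disable_timestamp();
 */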
1649
1650 static inline void net_timestamp_set(struct sk_buff *skb)
1651 {
1652         skb->tstamp.tv64 = 0;
1653         if (static_key_false(&netstamp_needed))
1654                 __net_timestamp(skb);
1655 }
1656
1657 #define net_timestamp_check(COND, SKB)                  \
1658         if (static_key_false(&netstamp_needed)) {               \
1659                 if ((COND) && !(SKB)->tstamp.tv64)      \
1660                         __net_timestamp(SKB);           \
1661         }                                               \
1662
1663 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1664 {
1665         unsigned int len;
1666
1667         if (!(dev->flags & IFF_UP))
1668                 return false;
1669
1670         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1671         if (skb->len <= len)
1672                 return true;
1673
1674         /* if TSO is enabled, we don't care about the length as the packet
1675          * could be forwarded without being segmented before
1676          */
1677         if (skb_is_gso(skb))
1678                 return true;
1679
1680         return false;
1681 }
1682 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1683
1684 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1685 {
1686         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1687                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1688                         atomic_long_inc(&dev->rx_dropped);
1689                         kfree_skb(skb);
1690                         return NET_RX_DROP;
1691                 }
1692         }
1693
1694         if (unlikely(!is_skb_forwardable(dev, skb))) {
1695                 atomic_long_inc(&dev->rx_dropped);
1696                 kfree_skb(skb);
1697                 return NET_RX_DROP;
1698         }
1699
1700         skb_scrub_packet(skb, true);
1701         skb->protocol = eth_type_trans(skb, dev);
1702
1703         return 0;
1704 }
1705 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1706
1707 /**
1708  * dev_forward_skb - loopback an skb to another netif
1709  *
1710  * @dev: destination network device
1711  * @skb: buffer to forward
1712  *
1713  * return values:
1714  *      NET_RX_SUCCESS  (no congestion)
1715  *      NET_RX_DROP     (packet was dropped, but freed)
1716  *
1717  * dev_forward_skb can be used for injecting an skb from the
1718  * start_xmit function of one device into the receive queue
1719  * of another device.
1720  *
1721  * The receiving device may be in another namespace, so
1722  * we have to clear all information in the skb that could
1723  * impact namespace isolation.
1724  */
1725 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1726 {
1727         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1728 }
1729 EXPORT_SYMBOL_GPL(dev_forward_skb);
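/*
 * Usage sketch, modelled loosely on what a veth-like pair driver does (the
 * "example_" names are hypothetical): the transmit routine of one device
 * injects the skb straight into its peer's receive path.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */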
1730
1731 static inline int deliver_skb(struct sk_buff *skb,
1732                               struct packet_type *pt_prev,
1733                               struct net_device *orig_dev)
1734 {
1735         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1736                 return -ENOMEM;
1737         atomic_inc(&skb->users);
1738         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1739 }
1740
1741 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1742 {
1743         if (!ptype->af_packet_priv || !skb->sk)
1744                 return false;
1745
1746         if (ptype->id_match)
1747                 return ptype->id_match(ptype, skb->sk);
1748         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1749                 return true;
1750
1751         return false;
1752 }
1753
1754 /*
1755  *      Support routine. Sends outgoing frames to any network
1756  *      taps currently in use.
1757  */
1758
1759 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1760 {
1761         struct packet_type *ptype;
1762         struct sk_buff *skb2 = NULL;
1763         struct packet_type *pt_prev = NULL;
1764
1765         rcu_read_lock();
1766         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1767                 /* Never send packets back to the socket
1768                  * they originated from - MvS (miquels@drinkel.ow.org)
1769                  */
1770                 if ((ptype->dev == dev || !ptype->dev) &&
1771                     (!skb_loop_sk(ptype, skb))) {
1772                         if (pt_prev) {
1773                                 deliver_skb(skb2, pt_prev, skb->dev);
1774                                 pt_prev = ptype;
1775                                 continue;
1776                         }
1777
1778                         skb2 = skb_clone(skb, GFP_ATOMIC);
1779                         if (!skb2)
1780                                 break;
1781
1782                         net_timestamp_set(skb2);
1783
1784                         /* The network header (skb->nh) should be set
1785                            correctly by the sender, so the check below is
1786                            just protection against buggy protocols.
1787                          */
1788                         skb_reset_mac_header(skb2);
1789
1790                         if (skb_network_header(skb2) < skb2->data ||
1791                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1792                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1793                                                      ntohs(skb2->protocol),
1794                                                      dev->name);
1795                                 skb_reset_network_header(skb2);
1796                         }
1797
1798                         skb2->transport_header = skb2->network_header;
1799                         skb2->pkt_type = PACKET_OUTGOING;
1800                         pt_prev = ptype;
1801                 }
1802         }
1803         if (pt_prev)
1804                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1805         rcu_read_unlock();
1806 }
1807
1808 /**
1809  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1810  * @dev: Network device
1811  * @txq: number of queues available
1812  *
1813  * If real_num_tx_queues is changed the tc mappings may no longer be
1814  * valid. To resolve this verify the tc mapping remains valid and if
1815  * not, null the mapping. With no priorities mapping to this
1816  * offset/count pair it will no longer be used. In the worst case, if
1817  * TC0 is invalid nothing can be done, so priority mappings are disabled
1818  * entirely. It is expected that drivers will fix this mapping if they
1819  * can before calling netif_set_real_num_tx_queues.
1820  */
1821 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1822 {
1823         int i;
1824         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1825
1826         /* If TC0 is invalidated disable TC mapping */
1827         if (tc->offset + tc->count > txq) {
1828                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1829                 dev->num_tc = 0;
1830                 return;
1831         }
1832
1833         /* Invalidated prio to tc mappings set to TC0 */
1834         for (i = 1; i < TC_BITMASK + 1; i++) {
1835                 int q = netdev_get_prio_tc_map(dev, i);
1836
1837                 tc = &dev->tc_to_txq[q];
1838                 if (tc->offset + tc->count > txq) {
1839                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1840                                 i, q);
1841                         netdev_set_prio_tc_map(dev, i, 0);
1842                 }
1843         }
1844 }
1845
1846 #ifdef CONFIG_XPS
1847 static DEFINE_MUTEX(xps_map_mutex);
1848 #define xmap_dereference(P)             \
1849         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1850
1851 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1852                                         int cpu, u16 index)
1853 {
1854         struct xps_map *map = NULL;
1855         int pos;
1856
1857         if (dev_maps)
1858                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1859
1860         for (pos = 0; map && pos < map->len; pos++) {
1861                 if (map->queues[pos] == index) {
1862                         if (map->len > 1) {
1863                                 map->queues[pos] = map->queues[--map->len];
1864                         } else {
1865                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1866                                 kfree_rcu(map, rcu);
1867                                 map = NULL;
1868                         }
1869                         break;
1870                 }
1871         }
1872
1873         return map;
1874 }
1875
1876 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1877 {
1878         struct xps_dev_maps *dev_maps;
1879         int cpu, i;
1880         bool active = false;
1881
1882         mutex_lock(&xps_map_mutex);
1883         dev_maps = xmap_dereference(dev->xps_maps);
1884
1885         if (!dev_maps)
1886                 goto out_no_maps;
1887
1888         for_each_possible_cpu(cpu) {
1889                 for (i = index; i < dev->num_tx_queues; i++) {
1890                         if (!remove_xps_queue(dev_maps, cpu, i))
1891                                 break;
1892                 }
1893                 if (i == dev->num_tx_queues)
1894                         active = true;
1895         }
1896
1897         if (!active) {
1898                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1899                 kfree_rcu(dev_maps, rcu);
1900         }
1901
1902         for (i = index; i < dev->num_tx_queues; i++)
1903                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1904                                              NUMA_NO_NODE);
1905
1906 out_no_maps:
1907         mutex_unlock(&xps_map_mutex);
1908 }
1909
1910 static struct xps_map *expand_xps_map(struct xps_map *map,
1911                                       int cpu, u16 index)
1912 {
1913         struct xps_map *new_map;
1914         int alloc_len = XPS_MIN_MAP_ALLOC;
1915         int i, pos;
1916
1917         for (pos = 0; map && pos < map->len; pos++) {
1918                 if (map->queues[pos] != index)
1919                         continue;
1920                 return map;
1921         }
1922
1923         /* Need to add queue to this CPU's existing map */
1924         if (map) {
1925                 if (pos < map->alloc_len)
1926                         return map;
1927
1928                 alloc_len = map->alloc_len * 2;
1929         }
1930
1931         /* Need to allocate a new map to store the queue for this CPU */
1932         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1933                                cpu_to_node(cpu));
1934         if (!new_map)
1935                 return NULL;
1936
1937         for (i = 0; i < pos; i++)
1938                 new_map->queues[i] = map->queues[i];
1939         new_map->alloc_len = alloc_len;
1940         new_map->len = pos;
1941
1942         return new_map;
1943 }
1944
1945 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1946                         u16 index)
1947 {
1948         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1949         struct xps_map *map, *new_map;
1950         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1951         int cpu, numa_node_id = -2;
1952         bool active = false;
1953
1954         mutex_lock(&xps_map_mutex);
1955
1956         dev_maps = xmap_dereference(dev->xps_maps);
1957
1958         /* allocate memory for queue storage */
1959         for_each_online_cpu(cpu) {
1960                 if (!cpumask_test_cpu(cpu, mask))
1961                         continue;
1962
1963                 if (!new_dev_maps)
1964                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1965                 if (!new_dev_maps) {
1966                         mutex_unlock(&xps_map_mutex);
1967                         return -ENOMEM;
1968                 }
1969
1970                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1971                                  NULL;
1972
1973                 map = expand_xps_map(map, cpu, index);
1974                 if (!map)
1975                         goto error;
1976
1977                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1978         }
1979
1980         if (!new_dev_maps)
1981                 goto out_no_new_maps;
1982
1983         for_each_possible_cpu(cpu) {
1984                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1985                         /* add queue to CPU maps */
1986                         int pos = 0;
1987
1988                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989                         while ((pos < map->len) && (map->queues[pos] != index))
1990                                 pos++;
1991
1992                         if (pos == map->len)
1993                                 map->queues[map->len++] = index;
1994 #ifdef CONFIG_NUMA
1995                         if (numa_node_id == -2)
1996                                 numa_node_id = cpu_to_node(cpu);
1997                         else if (numa_node_id != cpu_to_node(cpu))
1998                                 numa_node_id = -1;
1999 #endif
2000                 } else if (dev_maps) {
2001                         /* fill in the new device map from the old device map */
2002                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2003                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2004                 }
2005
2006         }
2007
2008         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2009
2010         /* Cleanup old maps */
2011         if (dev_maps) {
2012                 for_each_possible_cpu(cpu) {
2013                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2014                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2015                         if (map && map != new_map)
2016                                 kfree_rcu(map, rcu);
2017                 }
2018
2019                 kfree_rcu(dev_maps, rcu);
2020         }
2021
2022         dev_maps = new_dev_maps;
2023         active = true;
2024
2025 out_no_new_maps:
2026         /* update Tx queue numa node */
2027         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2028                                      (numa_node_id >= 0) ? numa_node_id :
2029                                      NUMA_NO_NODE);
2030
2031         if (!dev_maps)
2032                 goto out_no_maps;
2033
2034         /* removes queue from unused CPUs */
2035         for_each_possible_cpu(cpu) {
2036                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2037                         continue;
2038
2039                 if (remove_xps_queue(dev_maps, cpu, index))
2040                         active = true;
2041         }
2042
2043         /* free map if not active */
2044         if (!active) {
2045                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2046                 kfree_rcu(dev_maps, rcu);
2047         }
2048
2049 out_no_maps:
2050         mutex_unlock(&xps_map_mutex);
2051
2052         return 0;
2053 error:
2054         /* remove any maps that we added */
2055         for_each_possible_cpu(cpu) {
2056                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2057                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2058                                  NULL;
2059                 if (new_map && new_map != map)
2060                         kfree(new_map);
2061         }
2062
2063         mutex_unlock(&xps_map_mutex);
2064
2065         kfree(new_dev_maps);
2066         return -ENOMEM;
2067 }
2068 EXPORT_SYMBOL(netif_set_xps_queue);
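/*
 * Hypothetical driver-side sketch: spread transmit queues across online CPUs
 * by giving each queue a single-CPU XPS mask.  Purely illustrative; real
 * drivers usually derive the mask from their IRQ affinity instead.
 *
 *	int qi, cpu = cpumask_first(cpu_online_mask);
 *
 *	for (qi = 0; qi < dev->real_num_tx_queues; qi++) {
 *		netif_set_xps_queue(dev, cpumask_of(cpu), qi);
 *		cpu = cpumask_next(cpu, cpu_online_mask);
 *		if (cpu >= nr_cpu_ids)
 *			cpu = cpumask_first(cpu_online_mask);
 *	}
 */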
2069
2070 #endif
2071 /*
2072  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2073  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2074  */
2075 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2076 {
2077         int rc;
2078
2079         if (txq < 1 || txq > dev->num_tx_queues)
2080                 return -EINVAL;
2081
2082         if (dev->reg_state == NETREG_REGISTERED ||
2083             dev->reg_state == NETREG_UNREGISTERING) {
2084                 ASSERT_RTNL();
2085
2086                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2087                                                   txq);
2088                 if (rc)
2089                         return rc;
2090
2091                 if (dev->num_tc)
2092                         netif_setup_tc(dev, txq);
2093
2094                 if (txq < dev->real_num_tx_queues) {
2095                         qdisc_reset_all_tx_gt(dev, txq);
2096 #ifdef CONFIG_XPS
2097                         netif_reset_xps_queues_gt(dev, txq);
2098 #endif
2099                 }
2100         }
2101
2102         dev->real_num_tx_queues = txq;
2103         return 0;
2104 }
2105 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2106
2107 #ifdef CONFIG_SYSFS
2108 /**
2109  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2110  *      @dev: Network device
2111  *      @rxq: Actual number of RX queues
2112  *
2113  *      This must be called either with the rtnl_lock held or before
2114  *      registration of the net device.  Returns 0 on success, or a
2115  *      negative error code.  If called before registration, it always
2116  *      succeeds.
2117  */
2118 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2119 {
2120         int rc;
2121
2122         if (rxq < 1 || rxq > dev->num_rx_queues)
2123                 return -EINVAL;
2124
2125         if (dev->reg_state == NETREG_REGISTERED) {
2126                 ASSERT_RTNL();
2127
2128                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2129                                                   rxq);
2130                 if (rc)
2131                         return rc;
2132         }
2133
2134         dev->real_num_rx_queues = rxq;
2135         return 0;
2136 }
2137 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2138 #endif
2139
2140 /**
2141  * netif_get_num_default_rss_queues - default number of RSS queues
2142  *
2143  * This routine should set an upper limit on the number of RSS queues
2144  * used by default by multiqueue devices.
2145  */
2146 int netif_get_num_default_rss_queues(void)
2147 {
2148         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2149 }
2150 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
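/*
 * Sketch of how a multiqueue driver might size its queues at probe or open
 * time (hypothetical names, error handling omitted): cap the channel count
 * at the RSS default, then tell the stack how many queues are really in use.
 *
 *	unsigned int nch = min_t(unsigned int, hw_max_channels,
 *				 netif_get_num_default_rss_queues());
 *
 *	netif_set_real_num_tx_queues(dev, nch);
 *	netif_set_real_num_rx_queues(dev, nch);
 */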
2151
2152 static inline void __netif_reschedule(struct Qdisc *q)
2153 {
2154         struct softnet_data *sd;
2155         unsigned long flags;
2156
2157         local_irq_save(flags);
2158         sd = &__get_cpu_var(softnet_data);
2159         q->next_sched = NULL;
2160         *sd->output_queue_tailp = q;
2161         sd->output_queue_tailp = &q->next_sched;
2162         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2163         local_irq_restore(flags);
2164 }
2165
2166 void __netif_schedule(struct Qdisc *q)
2167 {
2168         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2169                 __netif_reschedule(q);
2170 }
2171 EXPORT_SYMBOL(__netif_schedule);
2172
2173 struct dev_kfree_skb_cb {
2174         enum skb_free_reason reason;
2175 };
2176
2177 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2178 {
2179         return (struct dev_kfree_skb_cb *)skb->cb;
2180 }
2181
2182 void netif_schedule_queue(struct netdev_queue *txq)
2183 {
2184         rcu_read_lock();
2185         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2186                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2187
2188                 __netif_schedule(q);
2189         }
2190         rcu_read_unlock();
2191 }
2192 EXPORT_SYMBOL(netif_schedule_queue);
2193
2194 /**
2195  *      netif_wake_subqueue - allow sending packets on subqueue
2196  *      @dev: network device
2197  *      @queue_index: sub queue index
2198  *
2199  * Resume individual transmit queue of a device with multiple transmit queues.
2200  */
2201 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2202 {
2203         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2204
2205         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2206                 struct Qdisc *q;
2207
2208                 rcu_read_lock();
2209                 q = rcu_dereference(txq->qdisc);
2210                 __netif_schedule(q);
2211                 rcu_read_unlock();
2212         }
2213 }
2214 EXPORT_SYMBOL(netif_wake_subqueue);
2215
2216 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2217 {
2218         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2219                 struct Qdisc *q;
2220
2221                 rcu_read_lock();
2222                 q = rcu_dereference(dev_queue->qdisc);
2223                 __netif_schedule(q);
2224                 rcu_read_unlock();
2225         }
2226 }
2227 EXPORT_SYMBOL(netif_tx_wake_queue);
2228
2229 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2230 {
2231         unsigned long flags;
2232
2233         if (likely(atomic_read(&skb->users) == 1)) {
2234                 smp_rmb();
2235                 atomic_set(&skb->users, 0);
2236         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2237                 return;
2238         }
2239         get_kfree_skb_cb(skb)->reason = reason;
2240         local_irq_save(flags);
2241         skb->next = __this_cpu_read(softnet_data.completion_queue);
2242         __this_cpu_write(softnet_data.completion_queue, skb);
2243         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2244         local_irq_restore(flags);
2245 }
2246 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2247
2248 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2249 {
2250         if (in_irq() || irqs_disabled())
2251                 __dev_kfree_skb_irq(skb, reason);
2252         else
2253                 dev_kfree_skb(skb);
2254 }
2255 EXPORT_SYMBOL(__dev_kfree_skb_any);
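/*
 * Typical (illustrative) use from a driver's TX completion handler, which
 * may run in hard-irq context.  dev_kfree_skb_any() and dev_consume_skb_any()
 * are thin wrappers around the helpers above that pick the irq-safe path
 * automatically ("transmitted_ok" is a hypothetical flag):
 *
 *	if (transmitted_ok)
 *		dev_consume_skb_any(skb);
 *	else
 *		dev_kfree_skb_any(skb);
 */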
2256
2257
2258 /**
2259  * netif_device_detach - mark device as removed
2260  * @dev: network device
2261  *
2262  * Mark device as removed from the system and therefore no longer available.
2263  */
2264 void netif_device_detach(struct net_device *dev)
2265 {
2266         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2267             netif_running(dev)) {
2268                 netif_tx_stop_all_queues(dev);
2269         }
2270 }
2271 EXPORT_SYMBOL(netif_device_detach);
2272
2273 /**
2274  * netif_device_attach - mark device as attached
2275  * @dev: network device
2276  *
2277  * Mark device as attached to the system and restart if needed.
2278  */
2279 void netif_device_attach(struct net_device *dev)
2280 {
2281         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2282             netif_running(dev)) {
2283                 netif_tx_wake_all_queues(dev);
2284                 __netdev_watchdog_up(dev);
2285         }
2286 }
2287 EXPORT_SYMBOL(netif_device_attach);
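/*
 * Sketch of the common suspend/resume pairing in a PCI network driver
 * (hypothetical "example_" callbacks, hardware handling elided):
 *
 *	static int example_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(netdev);
 *		if (netif_running(netdev))
 *			example_down_hw(netdev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		if (netif_running(netdev))
 *			example_up_hw(netdev);
 *		netif_device_attach(netdev);
 *		return 0;
 *	}
 */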
2288
2289 static void skb_warn_bad_offload(const struct sk_buff *skb)
2290 {
2291         static const netdev_features_t null_features = 0;
2292         struct net_device *dev = skb->dev;
2293         const char *driver = "";
2294
2295         if (!net_ratelimit())
2296                 return;
2297
2298         if (dev && dev->dev.parent)
2299                 driver = dev_driver_string(dev->dev.parent);
2300
2301         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2302              "gso_type=%d ip_summed=%d\n",
2303              driver, dev ? &dev->features : &null_features,
2304              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2305              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2306              skb_shinfo(skb)->gso_type, skb->ip_summed);
2307 }
2308
2309 /*
2310  * Invalidate hardware checksum when packet is to be mangled, and
2311  * complete checksum manually on outgoing path.
2312  */
2313 int skb_checksum_help(struct sk_buff *skb)
2314 {
2315         __wsum csum;
2316         int ret = 0, offset;
2317
2318         if (skb->ip_summed == CHECKSUM_COMPLETE)
2319                 goto out_set_summed;
2320
2321         if (unlikely(skb_shinfo(skb)->gso_size)) {
2322                 skb_warn_bad_offload(skb);
2323                 return -EINVAL;
2324         }
2325
2326         /* Before computing a checksum, we should make sure no frag could
2327          * be modified by an external entity: the checksum could be wrong.
2328          */
2329         if (skb_has_shared_frag(skb)) {
2330                 ret = __skb_linearize(skb);
2331                 if (ret)
2332                         goto out;
2333         }
2334
2335         offset = skb_checksum_start_offset(skb);
2336         BUG_ON(offset >= skb_headlen(skb));
2337         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2338
2339         offset += skb->csum_offset;
2340         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2341
2342         if (skb_cloned(skb) &&
2343             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2344                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2345                 if (ret)
2346                         goto out;
2347         }
2348
2349         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2350 out_set_summed:
2351         skb->ip_summed = CHECKSUM_NONE;
2352 out:
2353         return ret;
2354 }
2355 EXPORT_SYMBOL(skb_checksum_help);
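/*
 * Illustrative fallback in a driver transmit path whose hardware cannot
 * checksum a particular protocol (example_hw_can_csum() is hypothetical):
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 */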
2356
2357 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2358 {
2359         unsigned int vlan_depth = skb->mac_len;
2360         __be16 type = skb->protocol;
2361
2362         /* Tunnel gso handlers can set protocol to ethernet. */
2363         if (type == htons(ETH_P_TEB)) {
2364                 struct ethhdr *eth;
2365
2366                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2367                         return 0;
2368
2369                 eth = (struct ethhdr *)skb_mac_header(skb);
2370                 type = eth->h_proto;
2371         }
2372
2373         /* if skb->protocol is 802.1Q/AD then the header should already be
2374          * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2375          * ETH_HLEN otherwise
2376          */
2377         if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2378                 if (vlan_depth) {
2379                         if (WARN_ON(vlan_depth < VLAN_HLEN))
2380                                 return 0;
2381                         vlan_depth -= VLAN_HLEN;
2382                 } else {
2383                         vlan_depth = ETH_HLEN;
2384                 }
2385                 do {
2386                         struct vlan_hdr *vh;
2387
2388                         if (unlikely(!pskb_may_pull(skb,
2389                                                     vlan_depth + VLAN_HLEN)))
2390                                 return 0;
2391
2392                         vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2393                         type = vh->h_vlan_encapsulated_proto;
2394                         vlan_depth += VLAN_HLEN;
2395                 } while (type == htons(ETH_P_8021Q) ||
2396                          type == htons(ETH_P_8021AD));
2397         }
2398
2399         *depth = vlan_depth;
2400
2401         return type;
2402 }
2403
2404 /**
2405  *      skb_mac_gso_segment - mac layer segmentation handler.
2406  *      @skb: buffer to segment
2407  *      @features: features for the output path (see dev->features)
2408  */
2409 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2410                                     netdev_features_t features)
2411 {
2412         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2413         struct packet_offload *ptype;
2414         int vlan_depth = skb->mac_len;
2415         __be16 type = skb_network_protocol(skb, &vlan_depth);
2416
2417         if (unlikely(!type))
2418                 return ERR_PTR(-EINVAL);
2419
2420         __skb_pull(skb, vlan_depth);
2421
2422         rcu_read_lock();
2423         list_for_each_entry_rcu(ptype, &offload_base, list) {
2424                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2425                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2426                                 int err;
2427
2428                                 err = ptype->callbacks.gso_send_check(skb);
2429                                 segs = ERR_PTR(err);
2430                                 if (err || skb_gso_ok(skb, features))
2431                                         break;
2432                                 __skb_push(skb, (skb->data -
2433                                                  skb_network_header(skb)));
2434                         }
2435                         segs = ptype->callbacks.gso_segment(skb, features);
2436                         break;
2437                 }
2438         }
2439         rcu_read_unlock();
2440
2441         __skb_push(skb, skb->data - skb_mac_header(skb));
2442
2443         return segs;
2444 }
2445 EXPORT_SYMBOL(skb_mac_gso_segment);
2446
2447
2448 /* openvswitch calls this on rx path, so we need a different check.
2449  */
2450 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2451 {
2452         if (tx_path)
2453                 return skb->ip_summed != CHECKSUM_PARTIAL;
2454         else
2455                 return skb->ip_summed == CHECKSUM_NONE;
2456 }
2457
2458 /**
2459  *      __skb_gso_segment - Perform segmentation on skb.
2460  *      @skb: buffer to segment
2461  *      @features: features for the output path (see dev->features)
2462  *      @tx_path: whether it is called in TX path
2463  *
2464  *      This function segments the given skb and returns a list of segments.
2465  *
2466  *      It may return NULL if the skb requires no segmentation.  This is
2467  *      only possible when GSO is used for verifying header integrity.
2468  */
2469 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2470                                   netdev_features_t features, bool tx_path)
2471 {
2472         if (unlikely(skb_needs_check(skb, tx_path))) {
2473                 int err;
2474
2475                 skb_warn_bad_offload(skb);
2476
2477                 err = skb_cow_head(skb, 0);
2478                 if (err < 0)
2479                         return ERR_PTR(err);
2480         }
2481
2482         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2483         SKB_GSO_CB(skb)->encap_level = 0;
2484
2485         skb_reset_mac_header(skb);
2486         skb_reset_mac_len(skb);
2487
2488         return skb_mac_gso_segment(skb, features);
2489 }
2490 EXPORT_SYMBOL(__skb_gso_segment);
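/*
 * Rough sketch of a caller that cannot hand a GSO skb to hardware and so
 * segments it in software via skb_gso_segment(), which wraps this function
 * with tx_path set to true (simplified; see validate_xmit_skb() below for
 * the in-stack user):
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features);
 *
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 */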
2491
2492 /* Take action when hardware reception checksum errors are detected. */
2493 #ifdef CONFIG_BUG
2494 void netdev_rx_csum_fault(struct net_device *dev)
2495 {
2496         if (net_ratelimit()) {
2497                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2498                 dump_stack();
2499         }
2500 }
2501 EXPORT_SYMBOL(netdev_rx_csum_fault);
2502 #endif
2503
2504 /* Actually, we should eliminate this check as soon as we know that:
2505  * 1. An IOMMU is present and can map all of the memory.
2506  * 2. No high memory really exists on this machine.
2507  */
2508
2509 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2510 {
2511 #ifdef CONFIG_HIGHMEM
2512         int i;
2513         if (!(dev->features & NETIF_F_HIGHDMA)) {
2514                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2515                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2516                         if (PageHighMem(skb_frag_page(frag)))
2517                                 return 1;
2518                 }
2519         }
2520
2521         if (PCI_DMA_BUS_IS_PHYS) {
2522                 struct device *pdev = dev->dev.parent;
2523
2524                 if (!pdev)
2525                         return 0;
2526                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2527                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2528                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2529                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2530                                 return 1;
2531                 }
2532         }
2533 #endif
2534         return 0;
2535 }
2536
2537 /* If MPLS offload request, verify we are testing hardware MPLS features
2538  * instead of standard features for the netdev.
2539  */
2540 #ifdef CONFIG_NET_MPLS_GSO
2541 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2542                                            netdev_features_t features,
2543                                            __be16 type)
2544 {
2545         if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
2546                 features &= skb->dev->mpls_features;
2547
2548         return features;
2549 }
2550 #else
2551 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2552                                            netdev_features_t features,
2553                                            __be16 type)
2554 {
2555         return features;
2556 }
2557 #endif
2558
2559 static netdev_features_t harmonize_features(struct sk_buff *skb,
2560         netdev_features_t features)
2561 {
2562         int tmp;
2563         __be16 type;
2564
2565         type = skb_network_protocol(skb, &tmp);
2566         features = net_mpls_features(skb, features, type);
2567
2568         if (skb->ip_summed != CHECKSUM_NONE &&
2569             !can_checksum_protocol(features, type)) {
2570                 features &= ~NETIF_F_ALL_CSUM;
2571         } else if (illegal_highdma(skb->dev, skb)) {
2572                 features &= ~NETIF_F_SG;
2573         }
2574
2575         return features;
2576 }
2577
2578 netdev_features_t netif_skb_features(struct sk_buff *skb)
2579 {
2580         __be16 protocol = skb->protocol;
2581         netdev_features_t features = skb->dev->features;
2582
2583         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2584                 features &= ~NETIF_F_GSO_MASK;
2585
2586         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2587                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2588                 protocol = veh->h_vlan_encapsulated_proto;
2589         } else if (!vlan_tx_tag_present(skb)) {
2590                 return harmonize_features(skb, features);
2591         }
2592
2593         features = netdev_intersect_features(features,
2594                                              skb->dev->vlan_features |
2595                                              NETIF_F_HW_VLAN_CTAG_TX |
2596                                              NETIF_F_HW_VLAN_STAG_TX);
2597
2598         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2599                 features = netdev_intersect_features(features,
2600                                                      NETIF_F_SG |
2601                                                      NETIF_F_HIGHDMA |
2602                                                      NETIF_F_FRAGLIST |
2603                                                      NETIF_F_GEN_CSUM |
2604                                                      NETIF_F_HW_VLAN_CTAG_TX |
2605                                                      NETIF_F_HW_VLAN_STAG_TX);
2606
2607         return harmonize_features(skb, features);
2608 }
2609 EXPORT_SYMBOL(netif_skb_features);
2610
2611 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2612                     struct netdev_queue *txq, bool more)
2613 {
2614         unsigned int len;
2615         int rc;
2616
2617         if (!list_empty(&ptype_all))
2618                 dev_queue_xmit_nit(skb, dev);
2619
2620         len = skb->len;
2621         trace_net_dev_start_xmit(skb, dev);
2622         rc = netdev_start_xmit(skb, dev, txq, more);
2623         trace_net_dev_xmit(skb, rc, dev, len);
2624
2625         return rc;
2626 }
2627
2628 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2629                                     struct netdev_queue *txq, int *ret)
2630 {
2631         struct sk_buff *skb = first;
2632         int rc = NETDEV_TX_OK;
2633
2634         while (skb) {
2635                 struct sk_buff *next = skb->next;
2636
2637                 skb->next = NULL;
2638                 rc = xmit_one(skb, dev, txq, next != NULL);
2639                 if (unlikely(!dev_xmit_complete(rc))) {
2640                         skb->next = next;
2641                         goto out;
2642                 }
2643
2644                 skb = next;
2645                 if (netif_xmit_stopped(txq) && skb) {
2646                         rc = NETDEV_TX_BUSY;
2647                         break;
2648                 }
2649         }
2650
2651 out:
2652         *ret = rc;
2653         return skb;
2654 }
2655
2656 struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, netdev_features_t features)
2657 {
2658         if (vlan_tx_tag_present(skb) &&
2659             !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2660                 skb = __vlan_put_tag(skb, skb->vlan_proto,
2661                                      vlan_tx_tag_get(skb));
2662                 if (skb)
2663                         skb->vlan_tci = 0;
2664         }
2665         return skb;
2666 }
2667
2668 struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2669 {
2670         netdev_features_t features;
2671
2672         if (skb->next)
2673                 return skb;
2674
2675         /* If device doesn't need skb->dst, release it right now while
2676          * it's hot in this CPU's cache
2677          */
2678         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2679                 skb_dst_drop(skb);
2680
2681         features = netif_skb_features(skb);
2682         skb = validate_xmit_vlan(skb, features);
2683         if (unlikely(!skb))
2684                 goto out_null;
2685
2686         /* If encapsulation offload request, verify we are testing
2687          * hardware encapsulation features instead of standard
2688          * features for the netdev
2689          */
2690         if (skb->encapsulation)
2691                 features &= dev->hw_enc_features;
2692
2693         if (netif_needs_gso(skb, features)) {
2694                 struct sk_buff *segs;
2695
2696                 segs = skb_gso_segment(skb, features);
2697                 if (IS_ERR(segs)) {
2698                         segs = NULL;
2699                 } else if (segs) {
2700                         consume_skb(skb);
2701                         skb = segs;
2702                 }
2703         } else {
2704                 if (skb_needs_linearize(skb, features) &&
2705                     __skb_linearize(skb))
2706                         goto out_kfree_skb;
2707
2708                 /* If packet is not checksummed and device does not
2709                  * support checksumming for this protocol, complete
2710                  * checksumming here.
2711                  */
2712                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2713                         if (skb->encapsulation)
2714                                 skb_set_inner_transport_header(skb,
2715                                                                skb_checksum_start_offset(skb));
2716                         else
2717                                 skb_set_transport_header(skb,
2718                                                          skb_checksum_start_offset(skb));
2719                         if (!(features & NETIF_F_ALL_CSUM) &&
2720                             skb_checksum_help(skb))
2721                                 goto out_kfree_skb;
2722                 }
2723         }
2724
2725         return skb;
2726
2727 out_kfree_skb:
2728         kfree_skb(skb);
2729 out_null:
2730         return NULL;
2731 }
2732
2733 static void qdisc_pkt_len_init(struct sk_buff *skb)
2734 {
2735         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2736
2737         qdisc_skb_cb(skb)->pkt_len = skb->len;
2738
2739         /* To get a more precise estimate of bytes sent on the wire,
2740          * we add the header size of all segments to pkt_len
2741          */
2742         if (shinfo->gso_size)  {
2743                 unsigned int hdr_len;
2744                 u16 gso_segs = shinfo->gso_segs;
2745
2746                 /* mac layer + network layer */
2747                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2748
2749                 /* + transport layer */
2750                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2751                         hdr_len += tcp_hdrlen(skb);
2752                 else
2753                         hdr_len += sizeof(struct udphdr);
2754
2755                 if (shinfo->gso_type & SKB_GSO_DODGY)
2756                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2757                                                 shinfo->gso_size);
2758
2759                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2760         }
2761 }
2762
2763 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2764                                  struct net_device *dev,
2765                                  struct netdev_queue *txq)
2766 {
2767         spinlock_t *root_lock = qdisc_lock(q);
2768         bool contended;
2769         int rc;
2770
2771         qdisc_pkt_len_init(skb);
2772         qdisc_calculate_pkt_len(skb, q);
2773         /*
2774          * Heuristic to force contended enqueues to serialize on a
2775          * separate lock before trying to get the qdisc main lock.
2776          * This permits the __QDISC___STATE_RUNNING owner to get the lock more
2777          * often and dequeue packets faster.
2778          */
2779         contended = qdisc_is_running(q);
2780         if (unlikely(contended))
2781                 spin_lock(&q->busylock);
2782
2783         spin_lock(root_lock);
2784         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2785                 kfree_skb(skb);
2786                 rc = NET_XMIT_DROP;
2787         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2788                    qdisc_run_begin(q)) {
2789                 /*
2790                  * This is a work-conserving queue; there are no old skbs
2791                  * waiting to be sent out; and the qdisc is not running -
2792                  * xmit the skb directly.
2793                  */
2794                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2795                         skb_dst_force(skb);
2796
2797                 qdisc_bstats_update(q, skb);
2798
2799                 skb = validate_xmit_skb(skb, dev);
2800                 if (skb && sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2801                         if (unlikely(contended)) {
2802                                 spin_unlock(&q->busylock);
2803                                 contended = false;
2804                         }
2805                         __qdisc_run(q);
2806                 } else
2807                         qdisc_run_end(q);
2808
2809                 rc = NET_XMIT_SUCCESS;
2810         } else {
2811                 skb_dst_force(skb);
2812                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2813                 if (qdisc_run_begin(q)) {
2814                         if (unlikely(contended)) {
2815                                 spin_unlock(&q->busylock);
2816                                 contended = false;
2817                         }
2818                         __qdisc_run(q);
2819                 }
2820         }
2821         spin_unlock(root_lock);
2822         if (unlikely(contended))
2823                 spin_unlock(&q->busylock);
2824         return rc;
2825 }
2826
2827 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2828 static void skb_update_prio(struct sk_buff *skb)
2829 {
2830         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2831
2832         if (!skb->priority && skb->sk && map) {
2833                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2834
2835                 if (prioidx < map->priomap_len)
2836                         skb->priority = map->priomap[prioidx];
2837         }
2838 }
2839 #else
2840 #define skb_update_prio(skb)
2841 #endif
2842
2843 static DEFINE_PER_CPU(int, xmit_recursion);
2844 #define RECURSION_LIMIT 10
2845
2846 /**
2847  *      dev_loopback_xmit - loop back @skb
2848  *      @skb: buffer to transmit
2849  */
2850 int dev_loopback_xmit(struct sk_buff *skb)
2851 {
2852         skb_reset_mac_header(skb);
2853         __skb_pull(skb, skb_network_offset(skb));
2854         skb->pkt_type = PACKET_LOOPBACK;
2855         skb->ip_summed = CHECKSUM_UNNECESSARY;
2856         WARN_ON(!skb_dst(skb));
2857         skb_dst_force(skb);
2858         netif_rx_ni(skb);
2859         return 0;
2860 }
2861 EXPORT_SYMBOL(dev_loopback_xmit);
2862
2863 /**
2864  *      __dev_queue_xmit - transmit a buffer
2865  *      @skb: buffer to transmit
2866  *      @accel_priv: private data used for L2 forwarding offload
2867  *
2868  *      Queue a buffer for transmission to a network device. The caller must
2869  *      have set the device and priority and built the buffer before calling
2870  *      this function. The function can be called from an interrupt.
2871  *
2872  *      A negative errno code is returned on a failure. A success does not
2873  *      guarantee the frame will be transmitted as it may be dropped due
2874  *      to congestion or traffic shaping.
2875  *
2876  * -----------------------------------------------------------------------------------
2877  *      I notice this method can also return errors from the queue disciplines,
2878  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2879  *      be positive.
2880  *
2881  *      Regardless of the return value, the skb is consumed, so it is currently
2882  *      difficult to retry a send to this method.  (You can bump the ref count
2883  *      before sending to hold a reference for retry if you are careful.)
2884  *
2885  *      When calling this method, interrupts MUST be enabled.  This is because
2886  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2887  *          --BLG
2888  */
2889 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2890 {
2891         struct net_device *dev = skb->dev;
2892         struct netdev_queue *txq;
2893         struct Qdisc *q;
2894         int rc = -ENOMEM;
2895
2896         skb_reset_mac_header(skb);
2897
2898         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2899                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2900
2901         /* Disable soft irqs for various locks below. Also
2902          * stops preemption for RCU.
2903          */
2904         rcu_read_lock_bh();
2905
2906         skb_update_prio(skb);
2907
2908         txq = netdev_pick_tx(dev, skb, accel_priv);
2909         q = rcu_dereference_bh(txq->qdisc);
2910
2911 #ifdef CONFIG_NET_CLS_ACT
2912         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2913 #endif
2914         trace_net_dev_queue(skb);
2915         if (q->enqueue) {
2916                 rc = __dev_xmit_skb(skb, q, dev, txq);
2917                 goto out;
2918         }
2919
2920         /* The device has no queue. Common case for software devices:
2921            loopback, all the sorts of tunnels...
2922
2923            Really, it is unlikely that netif_tx_lock protection is necessary
2924            here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2925            counters.)
2926            However, it is possible that they rely on the protection
2927            made by us here.
2928
2929            Check this and shoot the lock. It is not prone to deadlocks.
2930            Either shoot the noqueue qdisc, it is even simpler 8)
2931          */
2932         if (dev->flags & IFF_UP) {
2933                 int cpu = smp_processor_id(); /* ok because BHs are off */
2934
2935                 if (txq->xmit_lock_owner != cpu) {
2936
2937                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2938                                 goto recursion_alert;
2939
2940                         skb = validate_xmit_skb(skb, dev);
2941                         if (!skb)
2942                                 goto drop;
2943
2944                         HARD_TX_LOCK(dev, txq, cpu);
2945
2946                         if (!netif_xmit_stopped(txq)) {
2947                                 __this_cpu_inc(xmit_recursion);
2948                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2949                                 __this_cpu_dec(xmit_recursion);
2950                                 if (dev_xmit_complete(rc)) {
2951                                         HARD_TX_UNLOCK(dev, txq);
2952                                         goto out;
2953                                 }
2954                         }
2955                         HARD_TX_UNLOCK(dev, txq);
2956                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2957                                              dev->name);
2958                 } else {
2959                         /* Recursion is detected! It is possible,
2960                          * unfortunately
2961                          */
2962 recursion_alert:
2963                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2964                                              dev->name);
2965                 }
2966         }
2967
2968         rc = -ENETDOWN;
2969 drop:
2970         rcu_read_unlock_bh();
2971
2972         atomic_long_inc(&dev->tx_dropped);
2973         kfree_skb_list(skb);
2974         return rc;
2975 out:
2976         rcu_read_unlock_bh();
2977         return rc;
2978 }
2979
2980 int dev_queue_xmit(struct sk_buff *skb)
2981 {
2982         return __dev_queue_xmit(skb, NULL);
2983 }
2984 EXPORT_SYMBOL(dev_queue_xmit);
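/*
 * Minimal (illustrative) sketch of sending a fully built skb: the caller has
 * already constructed the headers and set skb->dev (and skb->priority if it
 * cares).  The skb is consumed whatever the outcome.
 *
 *	skb->dev = dev;
 *	err = dev_queue_xmit(skb);
 *	if (net_xmit_eval(err))
 *		... account for the drop ...
 */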
2985
2986 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2987 {
2988         return __dev_queue_xmit(skb, accel_priv);
2989 }
2990 EXPORT_SYMBOL(dev_queue_xmit_accel);
2991
2992
2993 /*=======================================================================
2994                         Receiver routines
2995   =======================================================================*/
2996
2997 int netdev_max_backlog __read_mostly = 1000;
2998 EXPORT_SYMBOL(netdev_max_backlog);
2999
3000 int netdev_tstamp_prequeue __read_mostly = 1;
3001 int netdev_budget __read_mostly = 300;
3002 int weight_p __read_mostly = 64;            /* old backlog weight */
3003
3004 /* Called with irq disabled */
3005 static inline void ____napi_schedule(struct softnet_data *sd,
3006                                      struct napi_struct *napi)
3007 {
3008         list_add_tail(&napi->poll_list, &sd->poll_list);
3009         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3010 }
3011
3012 #ifdef CONFIG_RPS
3013
3014 /* One global table that all flow-based protocols share. */
3015 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3016 EXPORT_SYMBOL(rps_sock_flow_table);
3017
3018 struct static_key rps_needed __read_mostly;
3019
3020 static struct rps_dev_flow *
3021 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3022             struct rps_dev_flow *rflow, u16 next_cpu)
3023 {
3024         if (next_cpu != RPS_NO_CPU) {
3025 #ifdef CONFIG_RFS_ACCEL
3026                 struct netdev_rx_queue *rxqueue;
3027                 struct rps_dev_flow_table *flow_table;
3028                 struct rps_dev_flow *old_rflow;
3029                 u32 flow_id;
3030                 u16 rxq_index;
3031                 int rc;
3032
3033                 /* Should we steer this flow to a different hardware queue? */
3034                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3035                     !(dev->features & NETIF_F_NTUPLE))
3036                         goto out;
3037                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3038                 if (rxq_index == skb_get_rx_queue(skb))
3039                         goto out;
3040
3041                 rxqueue = dev->_rx + rxq_index;
3042                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3043                 if (!flow_table)
3044                         goto out;
3045                 flow_id = skb_get_hash(skb) & flow_table->mask;
3046                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3047                                                         rxq_index, flow_id);
3048                 if (rc < 0)
3049                         goto out;
3050                 old_rflow = rflow;
3051                 rflow = &flow_table->flows[flow_id];
3052                 rflow->filter = rc;
3053                 if (old_rflow->filter == rflow->filter)
3054                         old_rflow->filter = RPS_NO_FILTER;
3055         out:
3056 #endif
3057                 rflow->last_qtail =
3058                         per_cpu(softnet_data, next_cpu).input_queue_head;
3059         }
3060
3061         rflow->cpu = next_cpu;
3062         return rflow;
3063 }
3064
3065 /*
3066  * get_rps_cpu is called from netif_receive_skb and returns the target
3067  * CPU from the RPS map of the receiving queue for a given skb.
3068  * rcu_read_lock must be held on entry.
3069  */
3070 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3071                        struct rps_dev_flow **rflowp)
3072 {
3073         struct netdev_rx_queue *rxqueue;
3074         struct rps_map *map;
3075         struct rps_dev_flow_table *flow_table;
3076         struct rps_sock_flow_table *sock_flow_table;
3077         int cpu = -1;
3078         u16 tcpu;
3079         u32 hash;
3080
3081         if (skb_rx_queue_recorded(skb)) {
3082                 u16 index = skb_get_rx_queue(skb);
3083                 if (unlikely(index >= dev->real_num_rx_queues)) {
3084                         WARN_ONCE(dev->real_num_rx_queues > 1,
3085                                   "%s received packet on queue %u, but number "
3086                                   "of RX queues is %u\n",
3087                                   dev->name, index, dev->real_num_rx_queues);
3088                         goto done;
3089                 }
3090                 rxqueue = dev->_rx + index;
3091         } else
3092                 rxqueue = dev->_rx;
3093
3094         map = rcu_dereference(rxqueue->rps_map);
3095         if (map) {
3096                 if (map->len == 1 &&
3097                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3098                         tcpu = map->cpus[0];
3099                         if (cpu_online(tcpu))
3100                                 cpu = tcpu;
3101                         goto done;
3102                 }
3103         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3104                 goto done;
3105         }
3106
3107         skb_reset_network_header(skb);
3108         hash = skb_get_hash(skb);
3109         if (!hash)
3110                 goto done;
3111
3112         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3113         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3114         if (flow_table && sock_flow_table) {
3115                 u16 next_cpu;
3116                 struct rps_dev_flow *rflow;
3117
3118                 rflow = &flow_table->flows[hash & flow_table->mask];
3119                 tcpu = rflow->cpu;
3120
3121                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3122
3123                 /*
3124                  * If the desired CPU (where last recvmsg was done) is
3125                  * different from current CPU (one in the rx-queue flow
3126                  * table entry), switch if one of the following holds:
3127                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3128                  *   - Current CPU is offline.
3129                  *   - The current CPU's queue tail has advanced beyond the
3130                  *     last packet that was enqueued using this table entry.
3131                  *     This guarantees that all previous packets for the flow
3132                  *     have been dequeued, thus preserving in-order delivery.
3133                  */
3134                 if (unlikely(tcpu != next_cpu) &&
3135                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3136                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3137                       rflow->last_qtail)) >= 0)) {
3138                         tcpu = next_cpu;
3139                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3140                 }
3141
3142                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3143                         *rflowp = rflow;
3144                         cpu = tcpu;
3145                         goto done;
3146                 }
3147         }
3148
3149         if (map) {
3150                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3151                 if (cpu_online(tcpu)) {
3152                         cpu = tcpu;
3153                         goto done;
3154                 }
3155         }
3156
3157 done:
3158         return cpu;
3159 }
3160
3161 #ifdef CONFIG_RFS_ACCEL
3162
3163 /**
3164  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3165  * @dev: Device on which the filter was set
3166  * @rxq_index: RX queue index
3167  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3168  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3169  *
3170  * Drivers that implement ndo_rx_flow_steer() should periodically call
3171  * this function for each installed filter and remove the filters for
3172  * which it returns %true.
3173  */
3174 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3175                          u32 flow_id, u16 filter_id)
3176 {
3177         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3178         struct rps_dev_flow_table *flow_table;
3179         struct rps_dev_flow *rflow;
3180         bool expire = true;
3181         int cpu;
3182
3183         rcu_read_lock();
3184         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3185         if (flow_table && flow_id <= flow_table->mask) {
3186                 rflow = &flow_table->flows[flow_id];
3187                 cpu = ACCESS_ONCE(rflow->cpu);
3188                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3189                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3190                            rflow->last_qtail) <
3191                      (int)(10 * flow_table->mask)))
3192                         expire = false;
3193         }
3194         rcu_read_unlock();
3195         return expire;
3196 }
3197 EXPORT_SYMBOL(rps_may_expire_flow);
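/* A minimal usage sketch for the rule described above, assuming a hypothetical
 * RFS-capable driver "foo" that remembers, per RX queue, the flow_id it passed
 * to ndo_rx_flow_steer() and the filter id it got back.  All foo_* names are
 * illustrative assumptions, not existing kernel symbols.
 *
 *	static void foo_expire_rfs_filters(struct foo_rx_queue *rxq)
 *	{
 *		int i;
 *
 *		for (i = 0; i < rxq->nfilters; i++) {
 *			struct foo_rfs_filter *f = &rxq->filters[i];
 *
 *			if (!f->installed)
 *				continue;
 *			if (rps_may_expire_flow(rxq->netdev, rxq->index,
 *						f->flow_id, f->filter_id)) {
 *				foo_hw_remove_filter(rxq, f);
 *				f->installed = false;
 *			}
 *		}
 *	}
 *
 * Drivers typically run a scan like this from a periodic service task.
 */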
3198
3199 #endif /* CONFIG_RFS_ACCEL */
3200
3201 /* Called from hardirq (IPI) context */
3202 static void rps_trigger_softirq(void *data)
3203 {
3204         struct softnet_data *sd = data;
3205
3206         ____napi_schedule(sd, &sd->backlog);
3207         sd->received_rps++;
3208 }
3209
3210 #endif /* CONFIG_RPS */
3211
3212 /*
3213  * Check if this softnet_data structure belongs to another CPU.
3214  * If so, queue it on our IPI list and return 1;
3215  * if not, return 0.
3216  */
3217 static int rps_ipi_queued(struct softnet_data *sd)
3218 {
3219 #ifdef CONFIG_RPS
3220         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3221
3222         if (sd != mysd) {
3223                 sd->rps_ipi_next = mysd->rps_ipi_list;
3224                 mysd->rps_ipi_list = sd;
3225
3226                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3227                 return 1;
3228         }
3229 #endif /* CONFIG_RPS */
3230         return 0;
3231 }
3232
3233 #ifdef CONFIG_NET_FLOW_LIMIT
3234 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3235 #endif
3236
3237 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3238 {
3239 #ifdef CONFIG_NET_FLOW_LIMIT
3240         struct sd_flow_limit *fl;
3241         struct softnet_data *sd;
3242         unsigned int old_flow, new_flow;
3243
3244         if (qlen < (netdev_max_backlog >> 1))
3245                 return false;
3246
3247         sd = &__get_cpu_var(softnet_data);
3248
3249         rcu_read_lock();
3250         fl = rcu_dereference(sd->flow_limit);
3251         if (fl) {
3252                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3253                 old_flow = fl->history[fl->history_head];
3254                 fl->history[fl->history_head] = new_flow;
3255
3256                 fl->history_head++;
3257                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3258
3259                 if (likely(fl->buckets[old_flow]))
3260                         fl->buckets[old_flow]--;
3261
3262                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3263                         fl->count++;
3264                         rcu_read_unlock();
3265                         return true;
3266                 }
3267         }
3268         rcu_read_unlock();
3269 #endif
3270         return false;
3271 }
3272
3273 /*
3274  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3275  * queue (which may belong to a remote CPU).
3276  */
3277 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3278                               unsigned int *qtail)
3279 {
3280         struct softnet_data *sd;
3281         unsigned long flags;
3282         unsigned int qlen;
3283
3284         sd = &per_cpu(softnet_data, cpu);
3285
3286         local_irq_save(flags);
3287
3288         rps_lock(sd);
3289         qlen = skb_queue_len(&sd->input_pkt_queue);
3290         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3291                 if (skb_queue_len(&sd->input_pkt_queue)) {
3292 enqueue:
3293                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3294                         input_queue_tail_incr_save(sd, qtail);
3295                         rps_unlock(sd);
3296                         local_irq_restore(flags);
3297                         return NET_RX_SUCCESS;
3298                 }
3299
3300                 /* Schedule NAPI for the backlog device.
3301                  * We can use a non-atomic operation since we own the queue lock.
3302                  */
3303                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3304                         if (!rps_ipi_queued(sd))
3305                                 ____napi_schedule(sd, &sd->backlog);
3306                 }
3307                 goto enqueue;
3308         }
3309
3310         sd->dropped++;
3311         rps_unlock(sd);
3312
3313         local_irq_restore(flags);
3314
3315         atomic_long_inc(&skb->dev->rx_dropped);
3316         kfree_skb(skb);
3317         return NET_RX_DROP;
3318 }
3319
3320 static int netif_rx_internal(struct sk_buff *skb)
3321 {
3322         int ret;
3323
3324         net_timestamp_check(netdev_tstamp_prequeue, skb);
3325
3326         trace_netif_rx(skb);
3327 #ifdef CONFIG_RPS
3328         if (static_key_false(&rps_needed)) {
3329                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3330                 int cpu;
3331
3332                 preempt_disable();
3333                 rcu_read_lock();
3334
3335                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3336                 if (cpu < 0)
3337                         cpu = smp_processor_id();
3338
3339                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3340
3341                 rcu_read_unlock();
3342                 preempt_enable();
3343         } else
3344 #endif
3345         {
3346                 unsigned int qtail;
3347                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3348                 put_cpu();
3349         }
3350         return ret;
3351 }
3352
3353 /**
3354  *      netif_rx        -       post buffer to the network code
3355  *      @skb: buffer to post
3356  *
3357  *      This function receives a packet from a device driver and queues it for
3358  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3359  *      may be dropped during processing for congestion control or by the
3360  *      protocol layers.
3361  *
3362  *      return values:
3363  *      NET_RX_SUCCESS  (no congestion)
3364  *      NET_RX_DROP     (packet was dropped)
3365  *
3366  */
3367
3368 int netif_rx(struct sk_buff *skb)
3369 {
3370         trace_netif_rx_entry(skb);
3371
3372         return netif_rx_internal(skb);
3373 }
3374 EXPORT_SYMBOL(netif_rx);
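/* A minimal sketch of the typical netif_rx() caller: a non-NAPI driver handing
 * a received frame to the stack from its interrupt handler.  The foo_* names
 * and the way the frame is copied are illustrative assumptions.
 *
 *	static void foo_rx_one(struct net_device *dev, const void *buf, int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), buf, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);		// may return NET_RX_DROP under congestion
 *	}
 */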
3375
3376 int netif_rx_ni(struct sk_buff *skb)
3377 {
3378         int err;
3379
3380         trace_netif_rx_ni_entry(skb);
3381
3382         preempt_disable();
3383         err = netif_rx_internal(skb);
3384         if (local_softirq_pending())
3385                 do_softirq();
3386         preempt_enable();
3387
3388         return err;
3389 }
3390 EXPORT_SYMBOL(netif_rx_ni);
3391
3392 static void net_tx_action(struct softirq_action *h)
3393 {
3394         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3395
3396         if (sd->completion_queue) {
3397                 struct sk_buff *clist;
3398
3399                 local_irq_disable();
3400                 clist = sd->completion_queue;
3401                 sd->completion_queue = NULL;
3402                 local_irq_enable();
3403
3404                 while (clist) {
3405                         struct sk_buff *skb = clist;
3406                         clist = clist->next;
3407
3408                         WARN_ON(atomic_read(&skb->users));
3409                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3410                                 trace_consume_skb(skb);
3411                         else
3412                                 trace_kfree_skb(skb, net_tx_action);
3413                         __kfree_skb(skb);
3414                 }
3415         }
3416
3417         if (sd->output_queue) {
3418                 struct Qdisc *head;
3419
3420                 local_irq_disable();
3421                 head = sd->output_queue;
3422                 sd->output_queue = NULL;
3423                 sd->output_queue_tailp = &sd->output_queue;
3424                 local_irq_enable();
3425
3426                 while (head) {
3427                         struct Qdisc *q = head;
3428                         spinlock_t *root_lock;
3429
3430                         head = head->next_sched;
3431
3432                         root_lock = qdisc_lock(q);
3433                         if (spin_trylock(root_lock)) {
3434                                 smp_mb__before_atomic();
3435                                 clear_bit(__QDISC_STATE_SCHED,
3436                                           &q->state);
3437                                 qdisc_run(q);
3438                                 spin_unlock(root_lock);
3439                         } else {
3440                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3441                                               &q->state)) {
3442                                         __netif_reschedule(q);
3443                                 } else {
3444                                         smp_mb__before_atomic();
3445                                         clear_bit(__QDISC_STATE_SCHED,
3446                                                   &q->state);
3447                                 }
3448                         }
3449                 }
3450         }
3451 }
3452
3453 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3454     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3455 /* This hook is defined here for ATM LANE */
3456 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3457                              unsigned char *addr) __read_mostly;
3458 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3459 #endif
3460
3461 #ifdef CONFIG_NET_CLS_ACT
3462 /* TODO: Maybe we should just force sch_ingress to be compiled in
3463  * whenever CONFIG_NET_CLS_ACT is. Otherwise we currently pay for a few
3464  * useless instructions (a compare and two extra stores) when the ingress
3465  * scheduler is not built but CONFIG_NET_CLS_ACT is.
3466  * NOTE: This doesn't stop any functionality; if you don't have
3467  * the ingress scheduler, you just can't add policies on ingress.
3468  *
3469  */
3470 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3471 {
3472         struct net_device *dev = skb->dev;
3473         u32 ttl = G_TC_RTTL(skb->tc_verd);
3474         int result = TC_ACT_OK;
3475         struct Qdisc *q;
3476
3477         if (unlikely(MAX_RED_LOOP < ttl++)) {
3478                 net_warn_ratelimited("Redir loop detected, dropping packet (%d->%d)\n",
3479                                      skb->skb_iif, dev->ifindex);
3480                 return TC_ACT_SHOT;
3481         }
3482
3483         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3484         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3485
3486         q = rcu_dereference(rxq->qdisc);
3487         if (q != &noop_qdisc) {
3488                 spin_lock(qdisc_lock(q));
3489                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3490                         result = qdisc_enqueue_root(skb, q);
3491                 spin_unlock(qdisc_lock(q));
3492         }
3493
3494         return result;
3495 }
3496
3497 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3498                                          struct packet_type **pt_prev,
3499                                          int *ret, struct net_device *orig_dev)
3500 {
3501         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3502
3503         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3504                 goto out;
3505
3506         if (*pt_prev) {
3507                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3508                 *pt_prev = NULL;
3509         }
3510
3511         switch (ing_filter(skb, rxq)) {
3512         case TC_ACT_SHOT:
3513         case TC_ACT_STOLEN:
3514                 kfree_skb(skb);
3515                 return NULL;
3516         }
3517
3518 out:
3519         skb->tc_verd = 0;
3520         return skb;
3521 }
3522 #endif
3523
3524 /**
3525  *      netdev_rx_handler_register - register receive handler
3526  *      @dev: device to register a handler for
3527  *      @rx_handler: receive handler to register
3528  *      @rx_handler_data: data pointer that is used by rx handler
3529  *
3530  *      Register a receive handler for a device. This handler will then be
3531  *      called from __netif_receive_skb. A negative errno code is returned
3532  *      on a failure.
3533  *
3534  *      The caller must hold the rtnl_mutex.
3535  *
3536  *      For a general description of rx_handler, see enum rx_handler_result.
3537  */
3538 int netdev_rx_handler_register(struct net_device *dev,
3539                                rx_handler_func_t *rx_handler,
3540                                void *rx_handler_data)
3541 {
3542         ASSERT_RTNL();
3543
3544         if (dev->rx_handler)
3545                 return -EBUSY;
3546
3547         /* Note: rx_handler_data must be set before rx_handler */
3548         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3549         rcu_assign_pointer(dev->rx_handler, rx_handler);
3550
3551         return 0;
3552 }
3553 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
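/* A minimal sketch of how an upper driver (bridge- or bond-like) might attach
 * to a lower device, per the description above.  The foo_* names are
 * illustrative assumptions; the handler must follow the semantics of
 * enum rx_handler_result.
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		foo_port_rx(port, skb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	// called under rtnl_lock(), e.g. from the upper driver's enslave path
 *	static int foo_attach_port(struct net_device *lower, struct foo_port *port)
 *	{
 *		return netdev_rx_handler_register(lower, foo_handle_frame, port);
 *	}
 *
 * The matching teardown path calls netdev_rx_handler_unregister(lower), also
 * under rtnl_lock().
 */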
3554
3555 /**
3556  *      netdev_rx_handler_unregister - unregister receive handler
3557  *      @dev: device to unregister a handler from
3558  *
3559  *      Unregister a receive handler from a device.
3560  *
3561  *      The caller must hold the rtnl_mutex.
3562  */
3563 void netdev_rx_handler_unregister(struct net_device *dev)
3564 {
3565
3566         ASSERT_RTNL();
3567         RCU_INIT_POINTER(dev->rx_handler, NULL);
3568         /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3569          * section is guaranteed to also see a non-NULL
3570          * rx_handler_data.
3571          */
3572         synchronize_net();
3573         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3574 }
3575 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3576
3577 /*
3578  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3579  * the special handling of PFMEMALLOC skbs.
3580  */
3581 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3582 {
3583         switch (skb->protocol) {
3584         case htons(ETH_P_ARP):
3585         case htons(ETH_P_IP):
3586         case htons(ETH_P_IPV6):
3587         case htons(ETH_P_8021Q):
3588         case htons(ETH_P_8021AD):
3589                 return true;
3590         default:
3591                 return false;
3592         }
3593 }
3594
3595 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3596 {
3597         struct packet_type *ptype, *pt_prev;
3598         rx_handler_func_t *rx_handler;
3599         struct net_device *orig_dev;
3600         struct net_device *null_or_dev;
3601         bool deliver_exact = false;
3602         int ret = NET_RX_DROP;
3603         __be16 type;
3604
3605         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3606
3607         trace_netif_receive_skb(skb);
3608
3609         orig_dev = skb->dev;
3610
3611         skb_reset_network_header(skb);
3612         if (!skb_transport_header_was_set(skb))
3613                 skb_reset_transport_header(skb);
3614         skb_reset_mac_len(skb);
3615
3616         pt_prev = NULL;
3617
3618         rcu_read_lock();
3619
3620 another_round:
3621         skb->skb_iif = skb->dev->ifindex;
3622
3623         __this_cpu_inc(softnet_data.processed);
3624
3625         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3626             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3627                 skb = skb_vlan_untag(skb);
3628                 if (unlikely(!skb))
3629                         goto unlock;
3630         }
3631
3632 #ifdef CONFIG_NET_CLS_ACT
3633         if (skb->tc_verd & TC_NCLS) {
3634                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3635                 goto ncls;
3636         }
3637 #endif
3638
3639         if (pfmemalloc)
3640                 goto skip_taps;
3641
3642         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3643                 if (!ptype->dev || ptype->dev == skb->dev) {
3644                         if (pt_prev)
3645                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3646                         pt_prev = ptype;
3647                 }
3648         }
3649
3650 skip_taps:
3651 #ifdef CONFIG_NET_CLS_ACT
3652         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3653         if (!skb)
3654                 goto unlock;
3655 ncls:
3656 #endif
3657
3658         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3659                 goto drop;
3660
3661         if (vlan_tx_tag_present(skb)) {
3662                 if (pt_prev) {
3663                         ret = deliver_skb(skb, pt_prev, orig_dev);
3664                         pt_prev = NULL;
3665                 }
3666                 if (vlan_do_receive(&skb))
3667                         goto another_round;
3668                 else if (unlikely(!skb))
3669                         goto unlock;
3670         }
3671
3672         rx_handler = rcu_dereference(skb->dev->rx_handler);
3673         if (rx_handler) {
3674                 if (pt_prev) {
3675                         ret = deliver_skb(skb, pt_prev, orig_dev);
3676                         pt_prev = NULL;
3677                 }
3678                 switch (rx_handler(&skb)) {
3679                 case RX_HANDLER_CONSUMED:
3680                         ret = NET_RX_SUCCESS;
3681                         goto unlock;
3682                 case RX_HANDLER_ANOTHER:
3683                         goto another_round;
3684                 case RX_HANDLER_EXACT:
3685                         deliver_exact = true;
3686                 case RX_HANDLER_PASS:
3687                         break;
3688                 default:
3689                         BUG();
3690                 }
3691         }
3692
3693         if (unlikely(vlan_tx_tag_present(skb))) {
3694                 if (vlan_tx_tag_get_id(skb))
3695                         skb->pkt_type = PACKET_OTHERHOST;
3696                 /* Note: we might in the future use prio bits
3697                  * and set skb->priority like in vlan_do_receive().
3698                  * For the time being, just ignore the Priority Code Point.
3699                  */
3700                 skb->vlan_tci = 0;
3701         }
3702
3703         /* deliver only exact match when indicated */
3704         null_or_dev = deliver_exact ? skb->dev : NULL;
3705
3706         type = skb->protocol;
3707         list_for_each_entry_rcu(ptype,
3708                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3709                 if (ptype->type == type &&
3710                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3711                      ptype->dev == orig_dev)) {
3712                         if (pt_prev)
3713                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3714                         pt_prev = ptype;
3715                 }
3716         }
3717
3718         if (pt_prev) {
3719                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3720                         goto drop;
3721                 else
3722                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3723         } else {
3724 drop:
3725                 atomic_long_inc(&skb->dev->rx_dropped);
3726                 kfree_skb(skb);
3727                 /* Jamal, now you will not be able to escape explaining
3728                  * to me how you were going to use this. :-)
3729                  */
3730                 ret = NET_RX_DROP;
3731         }
3732
3733 unlock:
3734         rcu_read_unlock();
3735         return ret;
3736 }
3737
3738 static int __netif_receive_skb(struct sk_buff *skb)
3739 {
3740         int ret;
3741
3742         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3743                 unsigned long pflags = current->flags;
3744
3745                 /*
3746                  * PFMEMALLOC skbs are special, they should
3747                  * - be delivered to SOCK_MEMALLOC sockets only
3748                  * - stay away from userspace
3749                  * - have bounded memory usage
3750                  *
3751                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3752                  * context down to all allocation sites.
3753                  */
3754                 current->flags |= PF_MEMALLOC;
3755                 ret = __netif_receive_skb_core(skb, true);
3756                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3757         } else
3758                 ret = __netif_receive_skb_core(skb, false);
3759
3760         return ret;
3761 }
3762
3763 static int netif_receive_skb_internal(struct sk_buff *skb)
3764 {
3765         net_timestamp_check(netdev_tstamp_prequeue, skb);
3766
3767         if (skb_defer_rx_timestamp(skb))
3768                 return NET_RX_SUCCESS;
3769
3770 #ifdef CONFIG_RPS
3771         if (static_key_false(&rps_needed)) {
3772                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3773                 int cpu, ret;
3774
3775                 rcu_read_lock();
3776
3777                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3778
3779                 if (cpu >= 0) {
3780                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3781                         rcu_read_unlock();
3782                         return ret;
3783                 }
3784                 rcu_read_unlock();
3785         }
3786 #endif
3787         return __netif_receive_skb(skb);
3788 }
3789
3790 /**
3791  *      netif_receive_skb - process receive buffer from network
3792  *      @skb: buffer to process
3793  *
3794  *      netif_receive_skb() is the main receive data processing function.
3795  *      It always succeeds. The buffer may be dropped during processing
3796  *      for congestion control or by the protocol layers.
3797  *
3798  *      This function may only be called from softirq context and interrupts
3799  *      should be enabled.
3800  *
3801  *      Return values (usually ignored):
3802  *      NET_RX_SUCCESS: no congestion
3803  *      NET_RX_DROP: packet was dropped
3804  */
3805 int netif_receive_skb(struct sk_buff *skb)
3806 {
3807         trace_netif_receive_skb_entry(skb);
3808
3809         return netif_receive_skb_internal(skb);
3810 }
3811 EXPORT_SYMBOL(netif_receive_skb);
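/* A minimal sketch of the usual caller: a NAPI poll routine feeding completed
 * frames to netif_receive_skb() from softirq context.  The foo_* names and the
 * descriptor handling are illustrative assumptions.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_rx_queue *rxq = container_of(napi, struct foo_rx_queue, napi);
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = foo_get_completed_rx(rxq);
 *
 *			if (!skb)
 *				break;
 *			skb->protocol = eth_type_trans(skb, rxq->netdev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */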
3812
3813 /* Network device is going away, flush any packets still pending.
3814  * Called with irqs disabled.
3815  */
3816 static void flush_backlog(void *arg)
3817 {
3818         struct net_device *dev = arg;
3819         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3820         struct sk_buff *skb, *tmp;
3821
3822         rps_lock(sd);
3823         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3824                 if (skb->dev == dev) {
3825                         __skb_unlink(skb, &sd->input_pkt_queue);
3826                         kfree_skb(skb);
3827                         input_queue_head_incr(sd);
3828                 }
3829         }
3830         rps_unlock(sd);
3831
3832         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3833                 if (skb->dev == dev) {
3834                         __skb_unlink(skb, &sd->process_queue);
3835                         kfree_skb(skb);
3836                         input_queue_head_incr(sd);
3837                 }
3838         }
3839 }
3840
3841 static int napi_gro_complete(struct sk_buff *skb)
3842 {
3843         struct packet_offload *ptype;
3844         __be16 type = skb->protocol;
3845         struct list_head *head = &offload_base;
3846         int err = -ENOENT;
3847
3848         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3849
3850         if (NAPI_GRO_CB(skb)->count == 1) {
3851                 skb_shinfo(skb)->gso_size = 0;
3852                 goto out;
3853         }
3854
3855         rcu_read_lock();
3856         list_for_each_entry_rcu(ptype, head, list) {
3857                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3858                         continue;
3859
3860                 err = ptype->callbacks.gro_complete(skb, 0);
3861                 break;
3862         }
3863         rcu_read_unlock();
3864
3865         if (err) {
3866                 WARN_ON(&ptype->list == head);
3867                 kfree_skb(skb);
3868                 return NET_RX_SUCCESS;
3869         }
3870
3871 out:
3872         return netif_receive_skb_internal(skb);
3873 }
3874
3875 /* napi->gro_list contains packets ordered by age, with the
3876  * youngest packets at the head of the list.
3877  * Complete skbs in reverse order to reduce latencies.
3878  */
3879 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3880 {
3881         struct sk_buff *skb, *prev = NULL;
3882
3883         /* scan list and build reverse chain */
3884         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3885                 skb->prev = prev;
3886                 prev = skb;
3887         }
3888
3889         for (skb = prev; skb; skb = prev) {
3890                 skb->next = NULL;
3891
3892                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3893                         return;
3894
3895                 prev = skb->prev;
3896                 napi_gro_complete(skb);
3897                 napi->gro_count--;
3898         }
3899
3900         napi->gro_list = NULL;
3901 }
3902 EXPORT_SYMBOL(napi_gro_flush);
3903
3904 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3905 {
3906         struct sk_buff *p;
3907         unsigned int maclen = skb->dev->hard_header_len;
3908         u32 hash = skb_get_hash_raw(skb);
3909
3910         for (p = napi->gro_list; p; p = p->next) {
3911                 unsigned long diffs;
3912
3913                 NAPI_GRO_CB(p)->flush = 0;
3914
3915                 if (hash != skb_get_hash_raw(p)) {
3916                         NAPI_GRO_CB(p)->same_flow = 0;
3917                         continue;
3918                 }
3919
3920                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3921                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3922                 if (maclen == ETH_HLEN)
3923                         diffs |= compare_ether_header(skb_mac_header(p),
3924                                                       skb_mac_header(skb));
3925                 else if (!diffs)
3926                         diffs = memcmp(skb_mac_header(p),
3927                                        skb_mac_header(skb),
3928                                        maclen);
3929                 NAPI_GRO_CB(p)->same_flow = !diffs;
3930         }
3931 }
3932
3933 static void skb_gro_reset_offset(struct sk_buff *skb)
3934 {
3935         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3936         const skb_frag_t *frag0 = &pinfo->frags[0];
3937
3938         NAPI_GRO_CB(skb)->data_offset = 0;
3939         NAPI_GRO_CB(skb)->frag0 = NULL;
3940         NAPI_GRO_CB(skb)->frag0_len = 0;
3941
3942         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3943             pinfo->nr_frags &&
3944             !PageHighMem(skb_frag_page(frag0))) {
3945                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3946                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3947         }
3948 }
3949
3950 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3951 {
3952         struct skb_shared_info *pinfo = skb_shinfo(skb);
3953
3954         BUG_ON(skb->end - skb->tail < grow);
3955
3956         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3957
3958         skb->data_len -= grow;
3959         skb->tail += grow;
3960
3961         pinfo->frags[0].page_offset += grow;
3962         skb_frag_size_sub(&pinfo->frags[0], grow);
3963
3964         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3965                 skb_frag_unref(skb, 0);
3966                 memmove(pinfo->frags, pinfo->frags + 1,
3967                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3968         }
3969 }
3970
3971 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3972 {
3973         struct sk_buff **pp = NULL;
3974         struct packet_offload *ptype;
3975         __be16 type = skb->protocol;
3976         struct list_head *head = &offload_base;
3977         int same_flow;
3978         enum gro_result ret;
3979         int grow;
3980
3981         if (!(skb->dev->features & NETIF_F_GRO))
3982                 goto normal;
3983
3984         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3985                 goto normal;
3986
3987         gro_list_prepare(napi, skb);
3988
3989         rcu_read_lock();
3990         list_for_each_entry_rcu(ptype, head, list) {
3991                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3992                         continue;
3993
3994                 skb_set_network_header(skb, skb_gro_offset(skb));
3995                 skb_reset_mac_len(skb);
3996                 NAPI_GRO_CB(skb)->same_flow = 0;
3997                 NAPI_GRO_CB(skb)->flush = 0;
3998                 NAPI_GRO_CB(skb)->free = 0;
3999                 NAPI_GRO_CB(skb)->udp_mark = 0;
4000
4001                 /* Setup for GRO checksum validation */
4002                 switch (skb->ip_summed) {
4003                 case CHECKSUM_COMPLETE:
4004                         NAPI_GRO_CB(skb)->csum = skb->csum;
4005                         NAPI_GRO_CB(skb)->csum_valid = 1;
4006                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4007                         break;
4008                 case CHECKSUM_UNNECESSARY:
4009                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4010                         NAPI_GRO_CB(skb)->csum_valid = 0;
4011                         break;
4012                 default:
4013                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4014                         NAPI_GRO_CB(skb)->csum_valid = 0;
4015                 }
4016
4017                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4018                 break;
4019         }
4020         rcu_read_unlock();
4021
4022         if (&ptype->list == head)
4023                 goto normal;
4024
4025         same_flow = NAPI_GRO_CB(skb)->same_flow;
4026         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4027
4028         if (pp) {
4029                 struct sk_buff *nskb = *pp;
4030
4031                 *pp = nskb->next;
4032                 nskb->next = NULL;
4033                 napi_gro_complete(nskb);
4034                 napi->gro_count--;
4035         }
4036
4037         if (same_flow)
4038                 goto ok;
4039
4040         if (NAPI_GRO_CB(skb)->flush)
4041                 goto normal;
4042
4043         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4044                 struct sk_buff *nskb = napi->gro_list;
4045
4046                 /* locate the end of the list to select the 'oldest' flow */
4047                 while (nskb->next) {
4048                         pp = &nskb->next;
4049                         nskb = *pp;
4050                 }
4051                 *pp = NULL;
4052                 nskb->next = NULL;
4053                 napi_gro_complete(nskb);
4054         } else {
4055                 napi->gro_count++;
4056         }
4057         NAPI_GRO_CB(skb)->count = 1;
4058         NAPI_GRO_CB(skb)->age = jiffies;
4059         NAPI_GRO_CB(skb)->last = skb;
4060         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4061         skb->next = napi->gro_list;
4062         napi->gro_list = skb;
4063         ret = GRO_HELD;
4064
4065 pull:
4066         grow = skb_gro_offset(skb) - skb_headlen(skb);
4067         if (grow > 0)
4068                 gro_pull_from_frag0(skb, grow);
4069 ok:
4070         return ret;
4071
4072 normal:
4073         ret = GRO_NORMAL;
4074         goto pull;
4075 }
4076
4077 struct packet_offload *gro_find_receive_by_type(__be16 type)
4078 {
4079         struct list_head *offload_head = &offload_base;
4080         struct packet_offload *ptype;
4081
4082         list_for_each_entry_rcu(ptype, offload_head, list) {
4083                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4084                         continue;
4085                 return ptype;
4086         }
4087         return NULL;
4088 }
4089 EXPORT_SYMBOL(gro_find_receive_by_type);
4090
4091 struct packet_offload *gro_find_complete_by_type(__be16 type)
4092 {
4093         struct list_head *offload_head = &offload_base;
4094         struct packet_offload *ptype;
4095
4096         list_for_each_entry_rcu(ptype, offload_head, list) {
4097                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4098                         continue;
4099                 return ptype;
4100         }
4101         return NULL;
4102 }
4103 EXPORT_SYMBOL(gro_find_complete_by_type);
4104
4105 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4106 {
4107         switch (ret) {
4108         case GRO_NORMAL:
4109                 if (netif_receive_skb_internal(skb))
4110                         ret = GRO_DROP;
4111                 break;
4112
4113         case GRO_DROP:
4114                 kfree_skb(skb);
4115                 break;
4116
4117         case GRO_MERGED_FREE:
4118                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4119                         kmem_cache_free(skbuff_head_cache, skb);
4120                 else
4121                         __kfree_skb(skb);
4122                 break;
4123
4124         case GRO_HELD:
4125         case GRO_MERGED:
4126                 break;
4127         }
4128
4129         return ret;
4130 }
4131
4132 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4133 {
4134         trace_napi_gro_receive_entry(skb);
4135
4136         skb_gro_reset_offset(skb);
4137
4138         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4139 }
4140 EXPORT_SYMBOL(napi_gro_receive);
4141
4142 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4143 {
4144         __skb_pull(skb, skb_headlen(skb));
4145         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4146         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4147         skb->vlan_tci = 0;
4148         skb->dev = napi->dev;
4149         skb->skb_iif = 0;
4150         skb->encapsulation = 0;
4151         skb_shinfo(skb)->gso_type = 0;
4152         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4153
4154         napi->skb = skb;
4155 }
4156
4157 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4158 {
4159         struct sk_buff *skb = napi->skb;
4160
4161         if (!skb) {
4162                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
4163                 napi->skb = skb;
4164         }
4165         return skb;
4166 }
4167 EXPORT_SYMBOL(napi_get_frags);
4168
4169 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4170                                       struct sk_buff *skb,
4171                                       gro_result_t ret)
4172 {
4173         switch (ret) {
4174         case GRO_NORMAL:
4175         case GRO_HELD:
4176                 __skb_push(skb, ETH_HLEN);
4177                 skb->protocol = eth_type_trans(skb, skb->dev);
4178                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4179                         ret = GRO_DROP;
4180                 break;
4181
4182         case GRO_DROP:
4183         case GRO_MERGED_FREE:
4184                 napi_reuse_skb(napi, skb);
4185                 break;
4186
4187         case GRO_MERGED:
4188                 break;
4189         }
4190
4191         return ret;
4192 }
4193
4194 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4195  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4196  * we copy the Ethernet header into skb->data to have a common layout.
4197  */
4198 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4199 {
4200         struct sk_buff *skb = napi->skb;
4201         const struct ethhdr *eth;
4202         unsigned int hlen = sizeof(*eth);
4203
4204         napi->skb = NULL;
4205
4206         skb_reset_mac_header(skb);
4207         skb_gro_reset_offset(skb);
4208
4209         eth = skb_gro_header_fast(skb, 0);
4210         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4211                 eth = skb_gro_header_slow(skb, hlen, 0);
4212                 if (unlikely(!eth)) {
4213                         napi_reuse_skb(napi, skb);
4214                         return NULL;
4215                 }
4216         } else {
4217                 gro_pull_from_frag0(skb, hlen);
4218                 NAPI_GRO_CB(skb)->frag0 += hlen;
4219                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4220         }
4221         __skb_pull(skb, hlen);
4222
4223         /*
4224          * This works because the only protocols we care about don't require
4225          * special handling.
4226          * We'll fix it up properly in napi_frags_finish().
4227          */
4228         skb->protocol = eth->h_proto;
4229
4230         return skb;
4231 }
4232
4233 gro_result_t napi_gro_frags(struct napi_struct *napi)
4234 {
4235         struct sk_buff *skb = napi_frags_skb(napi);
4236
4237         if (!skb)
4238                 return GRO_DROP;
4239
4240         trace_napi_gro_frags_entry(skb);
4241
4242         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4243 }
4244 EXPORT_SYMBOL(napi_gro_frags);
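/* A minimal sketch of the napi_get_frags()/napi_gro_frags() pattern used by
 * drivers that receive directly into pages rather than into a linear skb.
 * The foo_* names and the truesize accounting are illustrative assumptions.
 *
 *	static void foo_rx_page(struct napi_struct *napi, struct page *page,
 *				unsigned int offset, unsigned int len)
 *	{
 *		struct sk_buff *skb = napi_get_frags(napi);
 *
 *		if (!skb) {
 *			put_page(page);
 *			return;
 *		}
 *		skb_fill_page_desc(skb, 0, page, offset, len);
 *		skb->len += len;
 *		skb->data_len += len;
 *		skb->truesize += PAGE_SIZE;
 *		napi_gro_frags(napi);	// consumes napi->skb
 *	}
 *
 * Drivers that build a normal linear skb call napi_gro_receive() instead.
 */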
4245
4246 /* Compute the checksum from gro_offset and return the folded value
4247  * after adding in any pseudo checksum.
4248  */
4249 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4250 {
4251         __wsum wsum;
4252         __sum16 sum;
4253
4254         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4255
4256         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4257         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4258         if (likely(!sum)) {
4259                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4260                     !skb->csum_complete_sw)
4261                         netdev_rx_csum_fault(skb->dev);
4262         }
4263
4264         NAPI_GRO_CB(skb)->csum = wsum;
4265         NAPI_GRO_CB(skb)->csum_valid = 1;
4266
4267         return sum;
4268 }
4269 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4270
4271 /*
4272  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4273  * Note: called with local irq disabled, but exits with local irq enabled.
4274  */
4275 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4276 {
4277 #ifdef CONFIG_RPS
4278         struct softnet_data *remsd = sd->rps_ipi_list;
4279
4280         if (remsd) {
4281                 sd->rps_ipi_list = NULL;
4282
4283                 local_irq_enable();
4284
4285                 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4286                 while (remsd) {
4287                         struct softnet_data *next = remsd->rps_ipi_next;
4288
4289                         if (cpu_online(remsd->cpu))
4290                                 smp_call_function_single_async(remsd->cpu,
4291                                                            &remsd->csd);
4292                         remsd = next;
4293                 }
4294         } else
4295 #endif
4296                 local_irq_enable();
4297 }
4298
4299 static int process_backlog(struct napi_struct *napi, int quota)
4300 {
4301         int work = 0;
4302         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4303
4304 #ifdef CONFIG_RPS
4305         /* Check if we have pending IPIs; it is better to send them now
4306          * than to wait for net_rx_action() to end.
4307          */
4308         if (sd->rps_ipi_list) {
4309                 local_irq_disable();
4310                 net_rps_action_and_irq_enable(sd);
4311         }
4312 #endif
4313         napi->weight = weight_p;
4314         local_irq_disable();
4315         while (1) {
4316                 struct sk_buff *skb;
4317
4318                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4319                         local_irq_enable();
4320                         __netif_receive_skb(skb);
4321                         local_irq_disable();
4322                         input_queue_head_incr(sd);
4323                         if (++work >= quota) {
4324                                 local_irq_enable();
4325                                 return work;
4326                         }
4327                 }
4328
4329                 rps_lock(sd);
4330                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4331                         /*
4332                          * Inline a custom version of __napi_complete().
4333                          * Only the current CPU owns and manipulates this napi,
4334                          * and NAPI_STATE_SCHED is the only possible flag set
4335                          * on backlog.
4336                          * We can use a plain write instead of clear_bit(),
4337                          * and we don't need an smp_mb() memory barrier.
4338                          */
4339                         list_del(&napi->poll_list);
4340                         napi->state = 0;
4341                         rps_unlock(sd);
4342
4343                         break;
4344                 }
4345
4346                 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4347                                            &sd->process_queue);
4348                 rps_unlock(sd);
4349         }
4350         local_irq_enable();
4351
4352         return work;
4353 }
4354
4355 /**
4356  * __napi_schedule - schedule for receive
4357  * @n: entry to schedule
4358  *
4359  * The entry's receive function will be scheduled to run
4360  */
4361 void __napi_schedule(struct napi_struct *n)
4362 {
4363         unsigned long flags;
4364
4365         local_irq_save(flags);
4366         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4367         local_irq_restore(flags);
4368 }
4369 EXPORT_SYMBOL(__napi_schedule);
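/* A minimal sketch of the scheduling side: a driver's RX interrupt handler
 * masks its own interrupt source and defers further work to NAPI.  Drivers
 * normally use napi_schedule(), which combines napi_schedule_prep() and
 * __napi_schedule(); the foo_* names are illustrative assumptions.
 *
 *	static irqreturn_t foo_rx_irq(int irq, void *data)
 *	{
 *		struct foo_rx_queue *rxq = data;
 *
 *		if (napi_schedule_prep(&rxq->napi)) {
 *			foo_disable_rx_irq(rxq);
 *			__napi_schedule(&rxq->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */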
4370
4371 void __napi_complete(struct napi_struct *n)
4372 {
4373         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4374         BUG_ON(n->gro_list);
4375
4376         list_del(&n->poll_list);
4377         smp_mb__before_atomic();
4378         clear_bit(NAPI_STATE_SCHED, &n->state);
4379 }
4380 EXPORT_SYMBOL(__napi_complete);
4381
4382 void napi_complete(struct napi_struct *n)
4383 {
4384         unsigned long flags;
4385
4386         /*
4387          * Don't let napi dequeue from the CPU poll list,
4388          * just in case it's running on a different CPU.
4389          */
4390         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4391                 return;
4392
4393         napi_gro_flush(n, false);
4394         local_irq_save(flags);
4395         __napi_complete(n);
4396         local_irq_restore(flags);
4397 }
4398 EXPORT_SYMBOL(napi_complete);
4399
4400 /* must be called under rcu_read_lock(), as we don't take a reference */
4401 struct napi_struct *napi_by_id(unsigned int napi_id)
4402 {
4403         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4404         struct napi_struct *napi;
4405
4406         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4407                 if (napi->napi_id == napi_id)
4408                         return napi;
4409
4410         return NULL;
4411 }
4412 EXPORT_SYMBOL_GPL(napi_by_id);
4413
4414 void napi_hash_add(struct napi_struct *napi)
4415 {
4416         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4417
4418                 spin_lock(&napi_hash_lock);
4419
4420                 /* 0 is not a valid id, and we also skip an id that is already
4421                  * taken; we expect both events to be extremely rare.
4422                  */
4423                 napi->napi_id = 0;
4424                 while (!napi->napi_id) {
4425                         napi->napi_id = ++napi_gen_id;
4426                         if (napi_by_id(napi->napi_id))
4427                                 napi->napi_id = 0;
4428                 }
4429
4430                 hlist_add_head_rcu(&napi->napi_hash_node,
4431                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4432
4433                 spin_unlock(&napi_hash_lock);
4434         }
4435 }
4436 EXPORT_SYMBOL_GPL(napi_hash_add);
4437
4438 /* Warning: the caller is responsible for making sure an RCU grace period
4439  * has elapsed before freeing the memory containing @napi.
4440  */
4441 void napi_hash_del(struct napi_struct *napi)
4442 {
4443         spin_lock(&napi_hash_lock);
4444
4445         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4446                 hlist_del_rcu(&napi->napi_hash_node);
4447
4448         spin_unlock(&napi_hash_lock);
4449 }
4450 EXPORT_SYMBOL_GPL(napi_hash_del);
4451
4452 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4453                     int (*poll)(struct napi_struct *, int), int weight)
4454 {
4455         INIT_LIST_HEAD(&napi->poll_list);
4456         napi->gro_count = 0;
4457         napi->gro_list = NULL;
4458         napi->skb = NULL;
4459         napi->poll = poll;
4460         if (weight > NAPI_POLL_WEIGHT)
4461                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4462                             weight, dev->name);
4463         napi->weight = weight;
4464         list_add(&napi->dev_list, &dev->napi_list);
4465         napi->dev = dev;
4466 #ifdef CONFIG_NETPOLL
4467         spin_lock_init(&napi->poll_lock);
4468         napi->poll_owner = -1;
4469 #endif
4470         set_bit(NAPI_STATE_SCHED, &napi->state);
4471 }
4472 EXPORT_SYMBOL(netif_napi_add);
4473
4474 void netif_napi_del(struct napi_struct *napi)
4475 {
4476         list_del_init(&napi->dev_list);
4477         napi_free_frags(napi);
4478
4479         kfree_skb_list(napi->gro_list);
4480         napi->gro_list = NULL;
4481         napi->gro_count = 0;
4482 }
4483 EXPORT_SYMBOL(netif_napi_del);
4484
4485 static void net_rx_action(struct softirq_action *h)
4486 {
4487         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4488         unsigned long time_limit = jiffies + 2;
4489         int budget = netdev_budget;
4490         void *have;
4491
4492         local_irq_disable();
4493
4494         while (!list_empty(&sd->poll_list)) {
4495                 struct napi_struct *n;
4496                 int work, weight;
4497
4498                 /* If the softirq window is exhausted then punt.
4499                  * Allow this to run for 2 jiffies, which allows
4500                  * an average latency of 1.5/HZ.
4501                  */
4502                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4503                         goto softnet_break;
4504
4505                 local_irq_enable();
4506
4507                 /* Even though interrupts have been re-enabled, this
4508                  * access is safe because interrupts can only add new
4509                  * entries to the tail of this list, and only ->poll()
4510                  * calls can remove this head entry from the list.
4511                  */
4512                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4513
4514                 have = netpoll_poll_lock(n);
4515
4516                 weight = n->weight;
4517
4518                 /* This NAPI_STATE_SCHED test is for avoiding a race
4519                  * with netpoll's poll_napi().  Only the entity which
4520                  * obtains the lock and sees NAPI_STATE_SCHED set will
4521                  * actually make the ->poll() call.  Therefore we avoid
4522                  * accidentally calling ->poll() when NAPI is not scheduled.
4523                  */
4524                 work = 0;
4525                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4526                         work = n->poll(n, weight);
4527                         trace_napi_poll(n);
4528                 }
4529
4530                 WARN_ON_ONCE(work > weight);
4531
4532                 budget -= work;
4533
4534                 local_irq_disable();
4535
4536                 /* Drivers must not modify the NAPI state if they
4537                  * consume the entire weight.  In such cases this code
4538                  * still "owns" the NAPI instance and therefore can
4539                  * move the instance around on the list at-will.
4540                  */
4541                 if (unlikely(work == weight)) {
4542                         if (unlikely(napi_disable_pending(n))) {
4543                                 local_irq_enable();
4544                                 napi_complete(n);
4545                                 local_irq_disable();
4546                         } else {
4547                                 if (n->gro_list) {
4548                                         /* flush too old packets.
4549                                          * If HZ < 1000, flush all packets.
4550                                          */
4551                                         local_irq_enable();
4552                                         napi_gro_flush(n, HZ >= 1000);
4553                                         local_irq_disable();
4554                                 }
4555                                 list_move_tail(&n->poll_list, &sd->poll_list);
4556                         }
4557                 }
4558
4559                 netpoll_poll_unlock(have);
4560         }
4561 out:
4562         net_rps_action_and_irq_enable(sd);
4563
4564 #ifdef CONFIG_NET_DMA
4565         /*
4566          * There may not be any more sk_buffs coming right now, so push
4567          * any pending DMA copies to hardware
4568          */
4569         dma_issue_pending_all();
4570 #endif
4571
4572         return;
4573
4574 softnet_break:
4575         sd->time_squeeze++;
4576         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4577         goto out;
4578 }
4579
4580 struct netdev_adjacent {
4581         struct net_device *dev;
4582
4583         /* upper master flag, there can only be one master device per list */
4584         bool master;
4585
4586         /* counter for the number of times this device was added to us */
4587         u16 ref_nr;
4588
4589         /* private field for the users */
4590         void *private;
4591
4592         struct list_head list;
4593         struct rcu_head rcu;
4594 };
4595
4596 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4597                                                  struct net_device *adj_dev,
4598                                                  struct list_head *adj_list)
4599 {
4600         struct netdev_adjacent *adj;
4601
4602         list_for_each_entry(adj, adj_list, list) {
4603                 if (adj->dev == adj_dev)
4604                         return adj;
4605         }
4606         return NULL;
4607 }
4608
4609 /**
4610  * netdev_has_upper_dev - Check if device is linked to an upper device
4611  * @dev: device
4612  * @upper_dev: upper device to check
4613  *
4614  * Find out if a device is linked to the specified upper device and return true
4615  * in case it is. Note that this checks only the immediate upper device, not
4616  * the complete stack of devices. The caller must hold the RTNL lock.
4617  */
4618 bool netdev_has_upper_dev(struct net_device *dev,
4619                           struct net_device *upper_dev)
4620 {
4621         ASSERT_RTNL();
4622
4623         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4624 }
4625 EXPORT_SYMBOL(netdev_has_upper_dev);
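
/* Editor's note: a minimal usage sketch, not part of the original file.
 * It shows how a caller that already holds the RTNL lock might use
 * netdev_has_upper_dev() to refuse linking the same upper device twice.
 * The helper name example_can_link() and the -EBUSY policy are illustrative
 * assumptions; only netdev_has_upper_dev() itself comes from the code above.
 *
 *	static int example_can_link(struct net_device *port,
 *				    struct net_device *master)
 *	{
 *		ASSERT_RTNL();
 *		if (netdev_has_upper_dev(port, master))
 *			return -EBUSY;
 *		return 0;
 *	}
 */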
4626
4627 /**
4628  * netdev_has_any_upper_dev - Check if device is linked to some device
4629  * @dev: device
4630  *
4631  * Find out if a device is linked to an upper device and return true in case
4632  * it is. The caller must hold the RTNL lock.
4633  */
4634 static bool netdev_has_any_upper_dev(struct net_device *dev)
4635 {
4636         ASSERT_RTNL();
4637
4638         return !list_empty(&dev->all_adj_list.upper);
4639 }
4640
4641 /**
4642  * netdev_master_upper_dev_get - Get master upper device
4643  * @dev: device
4644  *
4645  * Find a master upper device and return pointer to it or NULL in case
4646  * it's not there. The caller must hold the RTNL lock.
4647  */
4648 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4649 {
4650         struct netdev_adjacent *upper;
4651
4652         ASSERT_RTNL();
4653
4654         if (list_empty(&dev->adj_list.upper))
4655                 return NULL;
4656
4657         upper = list_first_entry(&dev->adj_list.upper,
4658                                  struct netdev_adjacent, list);
4659         if (likely(upper->master))
4660                 return upper->dev;
4661         return NULL;
4662 }
4663 EXPORT_SYMBOL(netdev_master_upper_dev_get);
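
/* Editor's note: an illustrative sketch (not from the original file) of how
 * RTNL-holding code might report the master a port is enslaved to, e.g. its
 * bond or bridge. example_print_master() is a hypothetical name; only
 * netdev_master_upper_dev_get() comes from the code above.
 *
 *	static void example_print_master(struct net_device *port)
 *	{
 *		struct net_device *master;
 *
 *		ASSERT_RTNL();
 *		master = netdev_master_upper_dev_get(port);
 *		if (master)
 *			pr_info("%s is enslaved to %s\n",
 *				port->name, master->name);
 *	}
 */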
4664
4665 void *netdev_adjacent_get_private(struct list_head *adj_list)
4666 {
4667         struct netdev_adjacent *adj;
4668
4669         adj = list_entry(adj_list, struct netdev_adjacent, list);
4670
4671         return adj->private;
4672 }
4673 EXPORT_SYMBOL(netdev_adjacent_get_private);
4674
4675 /**
4676  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4677  * @dev: device
4678  * @iter: list_head ** of the current position
4679  *
4680  * Gets the next device from the dev's upper list, starting from iter
4681  * position. The caller must hold RCU read lock.
4682  */
4683 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4684                                                  struct list_head **iter)
4685 {
4686         struct netdev_adjacent *upper;
4687
4688         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4689
4690         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4691
4692         if (&upper->list == &dev->adj_list.upper)
4693                 return NULL;
4694
4695         *iter = &upper->list;
4696
4697         return upper->dev;
4698 }
4699 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
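
/* Editor's note: a hedged sketch of walking the immediate upper devices with
 * netdev_upper_get_next_dev_rcu(). The iterator starts at the list head and
 * the walk must run under rcu_read_lock(); example_walk_uppers() is a made-up
 * name used only for illustration.
 *
 *	static void example_walk_uppers(struct net_device *dev)
 *	{
 *		struct list_head *iter = &dev->adj_list.upper;
 *		struct net_device *upper;
 *
 *		rcu_read_lock();
 *		while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *			pr_info("%s has upper %s\n", dev->name, upper->name);
 *		rcu_read_unlock();
 *	}
 */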
4700
4701 /**
4702  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4703  * @dev: device
4704  * @iter: list_head ** of the current position
4705  *
4706  * Gets the next device from the dev's upper list, starting from iter
4707  * position. The caller must hold RCU read lock.
4708  */
4709 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4710                                                      struct list_head **iter)
4711 {
4712         struct netdev_adjacent *upper;
4713
4714         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4715
4716         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4717
4718         if (&upper->list == &dev->all_adj_list.upper)
4719                 return NULL;
4720
4721         *iter = &upper->list;
4722
4723         return upper->dev;
4724 }
4725 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4726
4727 /**
4728  * netdev_lower_get_next_private - Get the next ->private from the
4729  *                                 lower neighbour list
4730  * @dev: device
4731  * @iter: list_head ** of the current position
4732  *
4733  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4734  * list, starting from iter position. The caller must either hold the
4735  * RTNL lock or its own locking that guarantees that the neighbour lower
4736  * list will remain unchanged.
4737  */
4738 void *netdev_lower_get_next_private(struct net_device *dev,
4739                                     struct list_head **iter)
4740 {
4741         struct netdev_adjacent *lower;
4742
4743         lower = list_entry(*iter, struct netdev_adjacent, list);
4744
4745         if (&lower->list == &dev->adj_list.lower)
4746                 return NULL;
4747
4748         *iter = lower->list.next;
4749
4750         return lower->private;
4751 }
4752 EXPORT_SYMBOL(netdev_lower_get_next_private);
4753
4754 /**
4755  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4756  *                                     lower neighbour list, RCU
4757  *                                     variant
4758  * @dev: device
4759  * @iter: list_head ** of the current position
4760  *
4761  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4762  * list, starting from iter position. The caller must hold RCU read lock.
4763  */
4764 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4765                                         struct list_head **iter)
4766 {
4767         struct netdev_adjacent *lower;
4768
4769         WARN_ON_ONCE(!rcu_read_lock_held());
4770
4771         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4772
4773         if (&lower->list == &dev->adj_list.lower)
4774                 return NULL;
4775
4776         *iter = &lower->list;
4777
4778         return lower->private;
4779 }
4780 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4781
4782 /**
4783  * netdev_lower_get_next - Get the next device from the lower neighbour
4784  *                         list
4785  * @dev: device
4786  * @iter: list_head ** of the current position
4787  *
4788  * Gets the next netdev_adjacent from the dev's lower neighbour
4789  * list, starting from iter position. The caller must hold the RTNL lock or
4790  * its own locking that guarantees that the neighbour lower
4791  * list will remain unchanged.
4792  */
4793 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4794 {
4795         struct netdev_adjacent *lower;
4796
4797         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4798
4799         if (&lower->list == &dev->adj_list.lower)
4800                 return NULL;
4801
4802         *iter = &lower->list;
4803
4804         return lower->dev;
4805 }
4806 EXPORT_SYMBOL(netdev_lower_get_next);
4807
4808 /**
4809  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4810  *                                     lower neighbour list, RCU
4811  *                                     variant
4812  * @dev: device
4813  *
4814  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4815  * list. The caller must hold RCU read lock.
4816  */
4817 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4818 {
4819         struct netdev_adjacent *lower;
4820
4821         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4822                         struct netdev_adjacent, list);
4823         if (lower)
4824                 return lower->private;
4825         return NULL;
4826 }
4827 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4828
4829 /**
4830  * netdev_master_upper_dev_get_rcu - Get master upper device
4831  * @dev: device
4832  *
4833  * Find a master upper device and return pointer to it or NULL in case
4834  * it's not there. The caller must hold the RCU read lock.
4835  */
4836 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4837 {
4838         struct netdev_adjacent *upper;
4839
4840         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4841                                        struct netdev_adjacent, list);
4842         if (upper && likely(upper->master))
4843                 return upper->dev;
4844         return NULL;
4845 }
4846 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4847
4848 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4849                               struct net_device *adj_dev,
4850                               struct list_head *dev_list)
4851 {
4852         char linkname[IFNAMSIZ+7];
4853         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4854                 "upper_%s" : "lower_%s", adj_dev->name);
4855         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4856                                  linkname);
4857 }
4858 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4859                                char *name,
4860                                struct list_head *dev_list)
4861 {
4862         char linkname[IFNAMSIZ+7];
4863         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4864                 "upper_%s" : "lower_%s", name);
4865         sysfs_remove_link(&(dev->dev.kobj), linkname);
4866 }
4867
4868 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4869                                                  struct net_device *adj_dev,
4870                                                  struct list_head *dev_list)
4871 {
4872         return (dev_list == &dev->adj_list.upper ||
4873                 dev_list == &dev->adj_list.lower) &&
4874                 net_eq(dev_net(dev), dev_net(adj_dev));
4875 }
4876
4877 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4878                                         struct net_device *adj_dev,
4879                                         struct list_head *dev_list,
4880                                         void *private, bool master)
4881 {
4882         struct netdev_adjacent *adj;
4883         int ret;
4884
4885         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4886
4887         if (adj) {
4888                 adj->ref_nr++;
4889                 return 0;
4890         }
4891
4892         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4893         if (!adj)
4894                 return -ENOMEM;
4895
4896         adj->dev = adj_dev;
4897         adj->master = master;
4898         adj->ref_nr = 1;
4899         adj->private = private;
4900         dev_hold(adj_dev);
4901
4902         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4903                  adj_dev->name, dev->name, adj_dev->name);
4904
4905         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4906                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4907                 if (ret)
4908                         goto free_adj;
4909         }
4910
4911         /* Ensure that master link is always the first item in list. */
4912         if (master) {
4913                 ret = sysfs_create_link(&(dev->dev.kobj),
4914                                         &(adj_dev->dev.kobj), "master");
4915                 if (ret)
4916                         goto remove_symlinks;
4917
4918                 list_add_rcu(&adj->list, dev_list);
4919         } else {
4920                 list_add_tail_rcu(&adj->list, dev_list);
4921         }
4922
4923         return 0;
4924
4925 remove_symlinks:
4926         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4927                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4928 free_adj:
4929         kfree(adj);
4930         dev_put(adj_dev);
4931
4932         return ret;
4933 }
4934
4935 static void __netdev_adjacent_dev_remove(struct net_device *dev,
4936                                          struct net_device *adj_dev,
4937                                          struct list_head *dev_list)
4938 {
4939         struct netdev_adjacent *adj;
4940
4941         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4942
4943         if (!adj) {
4944                 pr_err("tried to remove device %s from %s\n",
4945                        dev->name, adj_dev->name);
4946                 BUG();
4947         }
4948
4949         if (adj->ref_nr > 1) {
4950                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4951                          adj->ref_nr-1);
4952                 adj->ref_nr--;
4953                 return;
4954         }
4955
4956         if (adj->master)
4957                 sysfs_remove_link(&(dev->dev.kobj), "master");
4958
4959         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4960                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4961
4962         list_del_rcu(&adj->list);
4963         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4964                  adj_dev->name, dev->name, adj_dev->name);
4965         dev_put(adj_dev);
4966         kfree_rcu(adj, rcu);
4967 }
4968
4969 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4970                                             struct net_device *upper_dev,
4971                                             struct list_head *up_list,
4972                                             struct list_head *down_list,
4973                                             void *private, bool master)
4974 {
4975         int ret;
4976
4977         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4978                                            master);
4979         if (ret)
4980                 return ret;
4981
4982         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4983                                            false);
4984         if (ret) {
4985                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4986                 return ret;
4987         }
4988
4989         return 0;
4990 }
4991
4992 static int __netdev_adjacent_dev_link(struct net_device *dev,
4993                                       struct net_device *upper_dev)
4994 {
4995         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4996                                                 &dev->all_adj_list.upper,
4997                                                 &upper_dev->all_adj_list.lower,
4998                                                 NULL, false);
4999 }
5000
5001 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5002                                                struct net_device *upper_dev,
5003                                                struct list_head *up_list,
5004                                                struct list_head *down_list)
5005 {
5006         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5007         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5008 }
5009
5010 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5011                                          struct net_device *upper_dev)
5012 {
5013         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5014                                            &dev->all_adj_list.upper,
5015                                            &upper_dev->all_adj_list.lower);
5016 }
5017
5018 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5019                                                 struct net_device *upper_dev,
5020                                                 void *private, bool master)
5021 {
5022         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5023
5024         if (ret)
5025                 return ret;
5026
5027         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5028                                                &dev->adj_list.upper,
5029                                                &upper_dev->adj_list.lower,
5030                                                private, master);
5031         if (ret) {
5032                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5033                 return ret;
5034         }
5035
5036         return 0;
5037 }
5038
5039 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5040                                                    struct net_device *upper_dev)
5041 {
5042         __netdev_adjacent_dev_unlink(dev, upper_dev);
5043         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5044                                            &dev->adj_list.upper,
5045                                            &upper_dev->adj_list.lower);
5046 }
5047
5048 static int __netdev_upper_dev_link(struct net_device *dev,
5049                                    struct net_device *upper_dev, bool master,
5050                                    void *private)
5051 {
5052         struct netdev_adjacent *i, *j, *to_i, *to_j;
5053         int ret = 0;
5054
5055         ASSERT_RTNL();
5056
5057         if (dev == upper_dev)
5058                 return -EBUSY;
5059
5060         /* To prevent loops, check that dev is not an upper device of upper_dev. */
5061         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5062                 return -EBUSY;
5063
5064         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5065                 return -EEXIST;
5066
5067         if (master && netdev_master_upper_dev_get(dev))
5068                 return -EBUSY;
5069
5070         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5071                                                    master);
5072         if (ret)
5073                 return ret;
5074
5075         /* Now that we linked these devs, make all the upper_dev's
5076          * all_adj_list.upper visible to every dev's all_adj_list.lower and
5077          * vice versa, and don't forget the devices themselves. All of these
5078          * links are non-neighbours.
5079          */
5080         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5081                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5082                         pr_debug("Interlinking %s with %s, non-neighbour\n",
5083                                  i->dev->name, j->dev->name);
5084                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5085                         if (ret)
5086                                 goto rollback_mesh;
5087                 }
5088         }
5089
5090         /* add dev to every upper_dev's upper device */
5091         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5092                 pr_debug("linking %s's upper device %s with %s\n",
5093                          upper_dev->name, i->dev->name, dev->name);
5094                 ret = __netdev_adjacent_dev_link(dev, i->dev);
5095                 if (ret)
5096                         goto rollback_upper_mesh;
5097         }
5098
5099         /* add upper_dev to every dev's lower device */
5100         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5101                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5102                          i->dev->name, upper_dev->name);
5103                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5104                 if (ret)
5105                         goto rollback_lower_mesh;
5106         }
5107
5108         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5109         return 0;
5110
5111 rollback_lower_mesh:
5112         to_i = i;
5113         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5114                 if (i == to_i)
5115                         break;
5116                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5117         }
5118
5119         i = NULL;
5120
5121 rollback_upper_mesh:
5122         to_i = i;
5123         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5124                 if (i == to_i)
5125                         break;
5126                 __netdev_adjacent_dev_unlink(dev, i->dev);
5127         }
5128
5129         i = j = NULL;
5130
5131 rollback_mesh:
5132         to_i = i;
5133         to_j = j;
5134         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5135                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5136                         if (i == to_i && j == to_j)
5137                                 break;
5138                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5139                 }
5140                 if (i == to_i)
5141                         break;
5142         }
5143
5144         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5145
5146         return ret;
5147 }
5148
5149 /**
5150  * netdev_upper_dev_link - Add a link to the upper device
5151  * @dev: device
5152  * @upper_dev: new upper device
5153  *
5154  * Adds a link to a device which is upper to this one. The caller must hold
5155  * the RTNL lock. On a failure a negative errno code is returned.
5156  * On success the reference counts are adjusted and the function
5157  * returns zero.
5158  */
5159 int netdev_upper_dev_link(struct net_device *dev,
5160                           struct net_device *upper_dev)
5161 {
5162         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5163 }
5164 EXPORT_SYMBOL(netdev_upper_dev_link);
5165
5166 /**
5167  * netdev_master_upper_dev_link - Add a master link to the upper device
5168  * @dev: device
5169  * @upper_dev: new upper device
5170  *
5171  * Adds a link to a device which is upper to this one. In this case, only
5172  * one master upper device can be linked, although other non-master devices
5173  * might be linked as well. The caller must hold the RTNL lock.
5174  * On a failure a negative errno code is returned. On success the reference
5175  * counts are adjusted and the function returns zero.
5176  */
5177 int netdev_master_upper_dev_link(struct net_device *dev,
5178                                  struct net_device *upper_dev)
5179 {
5180         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5181 }
5182 EXPORT_SYMBOL(netdev_master_upper_dev_link);
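
/* Editor's note: an illustrative, non-authoritative sketch of the enslave
 * path of a hypothetical master driver. Under RTNL it records the port as a
 * lower device by calling netdev_master_upper_dev_link(); the function name
 * example_enslave() is an assumption, not something defined in this file.
 *
 *	static int example_enslave(struct net_device *master,
 *				   struct net_device *port)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_master_upper_dev_link(port, master);
 *	}
 */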
5183
5184 int netdev_master_upper_dev_link_private(struct net_device *dev,
5185                                          struct net_device *upper_dev,
5186                                          void *private)
5187 {
5188         return __netdev_upper_dev_link(dev, upper_dev, true, private);
5189 }
5190 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5191
5192 /**
5193  * netdev_upper_dev_unlink - Removes a link to upper device
5194  * @dev: device
5195  * @upper_dev: upper device to unlink
5196  *
5197  * Removes a link to a device which is upper to this one. The caller must hold
5198  * the RTNL lock.
5199  */
5200 void netdev_upper_dev_unlink(struct net_device *dev,
5201                              struct net_device *upper_dev)
5202 {
5203         struct netdev_adjacent *i, *j;
5204         ASSERT_RTNL();
5205
5206         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5207
5208         /* Here is the tricky part. We must remove all dev's lower
5209          * devices from all upper_dev's upper devices and vice
5210          * versa, to maintain the graph relationship.
5211          */
5212         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5213                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5214                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5215
5216         /* Also remove the devices themselves from the lower/upper
5217          * device lists
5218          */
5219         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5220                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5221
5222         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5223                 __netdev_adjacent_dev_unlink(dev, i->dev);
5224
5225         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5226 }
5227 EXPORT_SYMBOL(netdev_upper_dev_unlink);
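
/* Editor's note: the matching teardown for the hypothetical enslave sketch
 * above, again illustrative only. Under RTNL the master simply drops the
 * link created by netdev_master_upper_dev_link().
 *
 *	static void example_release(struct net_device *master,
 *				    struct net_device *port)
 *	{
 *		ASSERT_RTNL();
 *		netdev_upper_dev_unlink(port, master);
 *	}
 */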
5228
5229 void netdev_adjacent_add_links(struct net_device *dev)
5230 {
5231         struct netdev_adjacent *iter;
5232
5233         struct net *net = dev_net(dev);
5234
5235         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5236                 if (!net_eq(net, dev_net(iter->dev)))
5237                         continue;
5238                 netdev_adjacent_sysfs_add(iter->dev, dev,
5239                                           &iter->dev->adj_list.lower);
5240                 netdev_adjacent_sysfs_add(dev, iter->dev,
5241                                           &dev->adj_list.upper);
5242         }
5243
5244         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5245                 if (!net_eq(net, dev_net(iter->dev)))
5246                         continue;
5247                 netdev_adjacent_sysfs_add(iter->dev, dev,
5248                                           &iter->dev->adj_list.upper);
5249                 netdev_adjacent_sysfs_add(dev, iter->dev,
5250                                           &dev->adj_list.lower);
5251         }
5252 }
5253
5254 void netdev_adjacent_del_links(struct net_device *dev)
5255 {
5256         struct netdev_adjacent *iter;
5257
5258         struct net *net = dev_net(dev);
5259
5260         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5261                 if (!net_eq(net, dev_net(iter->dev)))
5262                         continue;
5263                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5264                                           &iter->dev->adj_list.lower);
5265                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5266                                           &dev->adj_list.upper);
5267         }
5268
5269         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5270                 if (!net_eq(net, dev_net(iter->dev)))
5271                         continue;
5272                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5273                                           &iter->dev->adj_list.upper);
5274                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5275                                           &dev->adj_list.lower);
5276         }
5277 }
5278
5279 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5280 {
5281         struct netdev_adjacent *iter;
5282
5283         struct net *net = dev_net(dev);
5284
5285         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5286                 if (!net_eq(net, dev_net(iter->dev)))
5287                         continue;
5288                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5289                                           &iter->dev->adj_list.lower);
5290                 netdev_adjacent_sysfs_add(iter->dev, dev,
5291                                           &iter->dev->adj_list.lower);
5292         }
5293
5294         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5295                 if (!net_eq(net, dev_net(iter->dev)))
5296                         continue;
5297                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5298                                           &iter->dev->adj_list.upper);
5299                 netdev_adjacent_sysfs_add(iter->dev, dev,
5300                                           &iter->dev->adj_list.upper);
5301         }
5302 }
5303
5304 void *netdev_lower_dev_get_private(struct net_device *dev,
5305                                    struct net_device *lower_dev)
5306 {
5307         struct netdev_adjacent *lower;
5308
5309         if (!lower_dev)
5310                 return NULL;
5311         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5312         if (!lower)
5313                 return NULL;
5314
5315         return lower->private;
5316 }
5317 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5318
5319
5320 int dev_get_nest_level(struct net_device *dev,
5321                        bool (*type_check)(struct net_device *dev))
5322 {
5323         struct net_device *lower = NULL;
5324         struct list_head *iter;
5325         int max_nest = -1;
5326         int nest;
5327
5328         ASSERT_RTNL();
5329
5330         netdev_for_each_lower_dev(dev, lower, iter) {
5331                 nest = dev_get_nest_level(lower, type_check);
5332                 if (max_nest < nest)
5333                         max_nest = nest;
5334         }
5335
5336         if (type_check(dev))
5337                 max_nest++;
5338
5339         return max_nest;
5340 }
5341 EXPORT_SYMBOL(dev_get_nest_level);
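
/* Editor's note: a hedged example of dev_get_nest_level(). The predicate is
 * applied to @dev and to every device stacked below it, so the result
 * reflects how deeply matching devices are nested in the stack rooted at
 * @dev. The 802.1Q VLAN check below (via the IFF_802_1Q_VLAN priv flag) and
 * the helper names are illustrative.
 *
 *	static bool example_is_vlan(struct net_device *dev)
 *	{
 *		return dev->priv_flags & IFF_802_1Q_VLAN;
 *	}
 *
 *	static int example_vlan_nest_level(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_get_nest_level(dev, example_is_vlan);
 *	}
 */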
5342
5343 static void dev_change_rx_flags(struct net_device *dev, int flags)
5344 {
5345         const struct net_device_ops *ops = dev->netdev_ops;
5346
5347         if (ops->ndo_change_rx_flags)
5348                 ops->ndo_change_rx_flags(dev, flags);
5349 }
5350
5351 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5352 {
5353         unsigned int old_flags = dev->flags;
5354         kuid_t uid;
5355         kgid_t gid;
5356
5357         ASSERT_RTNL();
5358
5359         dev->flags |= IFF_PROMISC;
5360         dev->promiscuity += inc;
5361         if (dev->promiscuity == 0) {
5362                 /*
5363                  * Avoid overflow.
5364                  * If inc causes overflow, untouch promisc and return error.
5365                  */
5366                 if (inc < 0)
5367                         dev->flags &= ~IFF_PROMISC;
5368                 else {
5369                         dev->promiscuity -= inc;
5370                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5371                                 dev->name);
5372                         return -EOVERFLOW;
5373                 }
5374         }
5375         if (dev->flags != old_flags) {
5376                 pr_info("device %s %s promiscuous mode\n",
5377                         dev->name,
5378                         dev->flags & IFF_PROMISC ? "entered" : "left");
5379                 if (audit_enabled) {
5380                         current_uid_gid(&uid, &gid);
5381                         audit_log(current->audit_context, GFP_ATOMIC,
5382                                 AUDIT_ANOM_PROMISCUOUS,
5383                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5384                                 dev->name, (dev->flags & IFF_PROMISC),
5385                                 (old_flags & IFF_PROMISC),
5386                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5387                                 from_kuid(&init_user_ns, uid),
5388                                 from_kgid(&init_user_ns, gid),
5389                                 audit_get_sessionid(current));
5390                 }
5391
5392                 dev_change_rx_flags(dev, IFF_PROMISC);
5393         }
5394         if (notify)
5395                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5396         return 0;
5397 }
5398
5399 /**
5400  *      dev_set_promiscuity     - update promiscuity count on a device
5401  *      @dev: device
5402  *      @inc: modifier
5403  *
5404  *      Add or remove promiscuity from a device. While the count in the device
5405  *      remains above zero the interface remains promiscuous. Once it hits zero
5406  *      the device reverts to normal filtering operation. A negative inc
5407  *      value is used to drop promiscuity on the device.
5408  *      Return 0 if successful or a negative errno code on error.
5409  */
5410 int dev_set_promiscuity(struct net_device *dev, int inc)
5411 {
5412         unsigned int old_flags = dev->flags;
5413         int err;
5414
5415         err = __dev_set_promiscuity(dev, inc, true);
5416         if (err < 0)
5417                 return err;
5418         if (dev->flags != old_flags)
5419                 dev_set_rx_mode(dev);
5420         return err;
5421 }
5422 EXPORT_SYMBOL(dev_set_promiscuity);
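
/* Editor's note: a minimal sketch (not from this file) of the usual counted
 * pairing around dev_set_promiscuity(): take one reference while a packet
 * tap is active and drop it on teardown, both under RTNL. The helper names
 * are hypothetical.
 *
 *	static int example_tap_start(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_promiscuity(dev, 1);
 *	}
 *
 *	static void example_tap_stop(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		dev_set_promiscuity(dev, -1);
 *	}
 */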
5423
5424 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5425 {
5426         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5427
5428         ASSERT_RTNL();
5429
5430         dev->flags |= IFF_ALLMULTI;
5431         dev->allmulti += inc;
5432         if (dev->allmulti == 0) {
5433                 /*
5434                  * Avoid overflow.
5435                  * If inc causes overflow, untouch allmulti and return error.
5436                  */
5437                 if (inc < 0)
5438                         dev->flags &= ~IFF_ALLMULTI;
5439                 else {
5440                         dev->allmulti -= inc;
5441                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5442                                 dev->name);
5443                         return -EOVERFLOW;
5444                 }
5445         }
5446         if (dev->flags ^ old_flags) {
5447                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5448                 dev_set_rx_mode(dev);
5449                 if (notify)
5450                         __dev_notify_flags(dev, old_flags,
5451                                            dev->gflags ^ old_gflags);
5452         }
5453         return 0;
5454 }
5455
5456 /**
5457  *      dev_set_allmulti        - update allmulti count on a device
5458  *      @dev: device
5459  *      @inc: modifier
5460  *
5461  *      Add or remove reception of all multicast frames to a device. While the
5462  *      count in the device remains above zero the interface keeps listening to
5463  *      all multicast frames. Once it hits zero the device reverts to normal
5464  *      filtering operation. A negative @inc value is used to drop the counter
5465  *      when releasing a resource needing all multicasts.
5466  *      Return 0 if successful or a negative errno code on error.
5467  */
5468
5469 int dev_set_allmulti(struct net_device *dev, int inc)
5470 {
5471         return __dev_set_allmulti(dev, inc, true);
5472 }
5473 EXPORT_SYMBOL(dev_set_allmulti);
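
/* Editor's note: dev_set_allmulti() follows the same counted pattern as
 * dev_set_promiscuity() above. An illustrative pairing, assuming RTNL is
 * held and with hypothetical helper names:
 *
 *	static int example_mcast_listen_start(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		return dev_set_allmulti(dev, 1);
 *	}
 *
 *	static void example_mcast_listen_stop(struct net_device *dev)
 *	{
 *		ASSERT_RTNL();
 *		dev_set_allmulti(dev, -1);
 *	}
 */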
5474
5475 /*
5476  *      Upload unicast and multicast address lists to device and
5477  *      configure RX filtering. When the device doesn't support unicast
5478  *      filtering it is put in promiscuous mode while unicast addresses
5479  *      are present.
5480  */
5481 void __dev_set_rx_mode(struct net_device *dev)
5482 {
5483         const struct net_device_ops *ops = dev->netdev_ops;
5484
5485         /* dev_open will call this function so the list will stay sane. */
5486         if (!(dev->flags&IFF_UP))
5487                 return;
5488
5489         if (!netif_device_present(dev))
5490                 return;
5491
5492         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5493                 /* Unicast address changes may only happen under the rtnl,
5494                  * therefore calling __dev_set_promiscuity here is safe.
5495                  */
5496                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5497                         __dev_set_promiscuity(dev, 1, false);
5498                         dev->uc_promisc = true;
5499                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5500                         __dev_set_promiscuity(dev, -1, false);
5501                         dev->uc_promisc = false;
5502                 }
5503         }
5504
5505         if (ops->ndo_set_rx_mode)
5506                 ops->ndo_set_rx_mode(dev);
5507 }
5508
5509 void dev_set_rx_mode(struct net_device *dev)
5510 {
5511         netif_addr_lock_bh(dev);
5512         __dev_set_rx_mode(dev);
5513         netif_addr_unlock_bh(dev);
5514 }
5515
5516 /**
5517  *      dev_get_flags - get flags reported to userspace
5518  *      @dev: device
5519  *
5520  *      Get the combination of flag bits exported through APIs to userspace.
5521  */
5522 unsigned int dev_get_flags(const struct net_device *dev)
5523 {
5524         unsigned int flags;
5525
5526         flags = (dev->flags & ~(IFF_PROMISC |
5527                                 IFF_ALLMULTI |
5528                                 IFF_RUNNING |
5529                                 IFF_LOWER_UP |
5530                                 IFF_DORMANT)) |
5531                 (dev->gflags & (IFF_PROMISC |
5532                                 IFF_ALLMULTI));
5533
5534         if (netif_running(dev)) {
5535                 if (netif_oper_up(dev))
5536                         flags |= IFF_RUNNING;
5537                 if (netif_carrier_ok(dev))
5538                         flags |= IFF_LOWER_UP;
5539                 if (netif_dormant(dev))
5540                         flags |= IFF_DORMANT;
5541         }
5542
5543         return flags;
5544 }
5545 EXPORT_SYMBOL(dev_get_flags);
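
/* Editor's note: an illustrative read of the userspace-visible flag word.
 * dev_get_flags() already folds the operational bits in, so a hypothetical
 * helper only needs to test the returned value:
 *
 *	static bool example_iface_is_usable(const struct net_device *dev)
 *	{
 *		unsigned int flags = dev_get_flags(dev);
 *
 *		return (flags & (IFF_UP | IFF_RUNNING)) ==
 *		       (IFF_UP | IFF_RUNNING);
 *	}
 */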
5546
5547 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5548 {
5549         unsigned int old_flags = dev->flags;
5550         int ret;
5551
5552         ASSERT_RTNL();
5553
5554         /*
5555          *      Set the flags on our device.
5556          */
5557
5558         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5559                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5560                                IFF_AUTOMEDIA)) |
5561                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5562                                     IFF_ALLMULTI));
5563
5564         /*
5565          *      Load in the correct multicast list now that the flags have changed.
5566          */
5567
5568         if ((old_flags ^ flags) & IFF_MULTICAST)
5569                 dev_change_rx_flags(dev, IFF_MULTICAST);
5570
5571         dev_set_rx_mode(dev);
5572
5573         /*
5574          *      Have we downed the interface? We handle IFF_UP ourselves
5575          *      according to user attempts to set it, rather than blindly
5576          *      setting it.
5577          */
5578
5579         ret = 0;
5580         if ((old_flags ^ flags) & IFF_UP)
5581                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5582
5583         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5584                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5585                 unsigned int old_flags = dev->flags;
5586
5587                 dev->gflags ^= IFF_PROMISC;
5588
5589                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5590                         if (dev->flags != old_flags)
5591                                 dev_set_rx_mode(dev);
5592         }
5593
5594         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5595            is important. Some (broken) drivers set IFF_PROMISC when
5596            IFF_ALLMULTI is requested, without asking us and without reporting.
5597          */
5598         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5599                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5600
5601                 dev->gflags ^= IFF_ALLMULTI;
5602                 __dev_set_allmulti(dev, inc, false);
5603         }
5604
5605         return ret;
5606 }
5607
5608 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5609                         unsigned int gchanges)
5610 {
5611         unsigned int changes = dev->flags ^ old_flags;
5612
5613         if (gchanges)
5614                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5615
5616         if (changes & IFF_UP) {
5617                 if (dev->flags & IFF_UP)
5618                         call_netdevice_notifiers(NETDEV_UP, dev);
5619                 else
5620                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5621         }
5622
5623         if (dev->flags & IFF_UP &&
5624             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5625                 struct netdev_notifier_change_info change_info;
5626
5627                 change_info.flags_changed = changes;
5628                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5629                                               &change_info.info);
5630         }
5631 }
5632
5633 /**
5634  *      dev_change_flags - change device settings
5635  *      @dev: device
5636  *      @flags: device state flags
5637  *
5638  *      Change settings on device based state flags. The flags are
5639  *      in the userspace exported format.
5640  */
5641 int dev_change_flags(struct net_device *dev, unsigned int flags)
5642 {
5643         int ret;
5644         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5645
5646         ret = __dev_change_flags(dev, flags);
5647         if (ret < 0)
5648                 return ret;
5649
5650         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5651         __dev_notify_flags(dev, old_flags, changes);
5652         return ret;
5653 }
5654 EXPORT_SYMBOL(dev_change_flags);
5655
5656 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5657 {
5658         const struct net_device_ops *ops = dev->netdev_ops;
5659
5660         if (ops->ndo_change_mtu)
5661                 return ops->ndo_change_mtu(dev, new_mtu);
5662
5663         dev->mtu = new_mtu;
5664         return 0;
5665 }
5666
5667 /**
5668  *      dev_set_mtu - Change maximum transfer unit
5669  *      @dev: device
5670  *      @new_mtu: new transfer unit
5671  *
5672  *      Change the maximum transfer size of the network device.
5673  */
5674 int dev_set_mtu(struct net_device *dev, int new_mtu)
5675 {
5676         int err, orig_mtu;
5677
5678         if (new_mtu == dev->mtu)
5679                 return 0;
5680
5681         /*      MTU must not be negative.    */
5682         if (new_mtu < 0)
5683                 return -EINVAL;
5684
5685         if (!netif_device_present(dev))
5686                 return -ENODEV;
5687
5688         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5689         err = notifier_to_errno(err);
5690         if (err)
5691                 return err;
5692
5693         orig_mtu = dev->mtu;
5694         err = __dev_set_mtu(dev, new_mtu);
5695
5696         if (!err) {
5697                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5698                 err = notifier_to_errno(err);
5699                 if (err) {
5700                         /* setting mtu back and notifying everyone again,
5701                          * so that they have a chance to revert changes.
5702                          */
5703                         __dev_set_mtu(dev, orig_mtu);
5704                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5705                 }
5706         }
5707         return err;
5708 }
5709 EXPORT_SYMBOL(dev_set_mtu);
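
/* Editor's note: a hedged sketch of changing the MTU from kernel code; the
 * notifier sequence above assumes the caller holds RTNL. The helper name and
 * the 1280-byte floor (IPv6's minimum link MTU) are illustrative choices,
 * not requirements of dev_set_mtu() itself.
 *
 *	static int example_set_ipv6_capable_mtu(struct net_device *dev, int mtu)
 *	{
 *		ASSERT_RTNL();
 *		if (mtu < 1280)
 *			return -EINVAL;
 *		return dev_set_mtu(dev, mtu);
 *	}
 */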
5710
5711 /**
5712  *      dev_set_group - Change group this device belongs to
5713  *      @dev: device
5714  *      @new_group: group this device should belong to
5715  */
5716 void dev_set_group(struct net_device *dev, int new_group)
5717 {
5718         dev->group = new_group;
5719 }
5720 EXPORT_SYMBOL(dev_set_group);
5721
5722 /**
5723  *      dev_set_mac_address - Change Media Access Control Address
5724  *      @dev: device
5725  *      @sa: new address
5726  *
5727  *      Change the hardware (MAC) address of the device
5728  */
5729 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5730 {
5731         const struct net_device_ops *ops = dev->netdev_ops;
5732         int err;
5733
5734         if (!ops->ndo_set_mac_address)
5735                 return -EOPNOTSUPP;
5736         if (sa->sa_family != dev->type)
5737                 return -EINVAL;
5738         if (!netif_device_present(dev))
5739                 return -ENODEV;
5740         err = ops->ndo_set_mac_address(dev, sa);
5741         if (err)
5742                 return err;
5743         dev->addr_assign_type = NET_ADDR_SET;
5744         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5745         add_device_randomness(dev->dev_addr, dev->addr_len);
5746         return 0;
5747 }
5748 EXPORT_SYMBOL(dev_set_mac_address);
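
/* Editor's note: an illustrative way to program a new Ethernet address via
 * dev_set_mac_address(); the sockaddr family must match dev->type and RTNL
 * must be held. example_set_ether_addr() is a made-up name and no checks on
 * @mac beyond its length are shown.
 *
 *	static int example_set_ether_addr(struct net_device *dev,
 *					  const u8 mac[ETH_ALEN])
 *	{
 *		struct sockaddr sa;
 *
 *		ASSERT_RTNL();
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, mac, ETH_ALEN);
 *		return dev_set_mac_address(dev, &sa);
 *	}
 */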
5749
5750 /**
5751  *      dev_change_carrier - Change device carrier
5752  *      @dev: device
5753  *      @new_carrier: new value
5754  *
5755  *      Change device carrier
5756  */
5757 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5758 {
5759         const struct net_device_ops *ops = dev->netdev_ops;
5760
5761         if (!ops->ndo_change_carrier)
5762                 return -EOPNOTSUPP;
5763         if (!netif_device_present(dev))
5764                 return -ENODEV;
5765         return ops->ndo_change_carrier(dev, new_carrier);
5766 }
5767 EXPORT_SYMBOL(dev_change_carrier);
5768
5769 /**
5770  *      dev_get_phys_port_id - Get device physical port ID
5771  *      @dev: device
5772  *      @ppid: port ID
5773  *
5774  *      Get device physical port ID
5775  */
5776 int dev_get_phys_port_id(struct net_device *dev,
5777                          struct netdev_phys_port_id *ppid)
5778 {
5779         const struct net_device_ops *ops = dev->netdev_ops;
5780
5781         if (!ops->ndo_get_phys_port_id)
5782                 return -EOPNOTSUPP;
5783         return ops->ndo_get_phys_port_id(dev, ppid);
5784 }
5785 EXPORT_SYMBOL(dev_get_phys_port_id);
5786
5787 /**
5788  *      dev_new_index   -       allocate an ifindex
5789  *      @net: the applicable net namespace
5790  *
5791  *      Returns a suitable unique value for a new device interface
5792  *      number.  The caller must hold the rtnl semaphore or the
5793  *      dev_base_lock to be sure it remains unique.
5794  */
5795 static int dev_new_index(struct net *net)
5796 {
5797         int ifindex = net->ifindex;
5798         for (;;) {
5799                 if (++ifindex <= 0)
5800                         ifindex = 1;
5801                 if (!__dev_get_by_index(net, ifindex))
5802                         return net->ifindex = ifindex;
5803         }
5804 }
5805
5806 /* Delayed registration/unregistration */
5807 static LIST_HEAD(net_todo_list);
5808 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5809
5810 static void net_set_todo(struct net_device *dev)
5811 {
5812         list_add_tail(&dev->todo_list, &net_todo_list);
5813         dev_net(dev)->dev_unreg_count++;
5814 }
5815
5816 static void rollback_registered_many(struct list_head *head)
5817 {
5818         struct net_device *dev, *tmp;
5819         LIST_HEAD(close_head);
5820
5821         BUG_ON(dev_boot_phase);
5822         ASSERT_RTNL();
5823
5824         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5825                 /* Some devices get here without ever having been
5826                  * registered, as part of initialization unwind.
5827                  * Remove those devices and proceed with the remaining.
5828                  */
5829                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5830                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5831                                  dev->name, dev);
5832
5833                         WARN_ON(1);
5834                         list_del(&dev->unreg_list);
5835                         continue;
5836                 }
5837                 dev->dismantle = true;
5838                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5839         }
5840
5841         /* If device is running, close it first. */
5842         list_for_each_entry(dev, head, unreg_list)
5843                 list_add_tail(&dev->close_list, &close_head);
5844         dev_close_many(&close_head);
5845
5846         list_for_each_entry(dev, head, unreg_list) {
5847                 /* And unlink it from device chain. */
5848                 unlist_netdevice(dev);
5849
5850                 dev->reg_state = NETREG_UNREGISTERING;
5851         }
5852
5853         synchronize_net();
5854
5855         list_for_each_entry(dev, head, unreg_list) {
5856                 /* Shutdown queueing discipline. */
5857                 dev_shutdown(dev);
5858
5859
5860                 /* Notify protocols, that we are about to destroy
5861                 /* Notify protocols that we are about to destroy
5862                    this device. They should clean up all of their state.
5863                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5864
5865                 /*
5866                  *      Flush the unicast and multicast chains
5867                  */
5868                 dev_uc_flush(dev);
5869                 dev_mc_flush(dev);
5870
5871                 if (dev->netdev_ops->ndo_uninit)
5872                         dev->netdev_ops->ndo_uninit(dev);
5873
5874                 if (!dev->rtnl_link_ops ||
5875                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5876                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5877
5878                 /* The notifier chain MUST detach all upper devices from us. */
5879                 WARN_ON(netdev_has_any_upper_dev(dev));
5880
5881                 /* Remove entries from kobject tree */
5882                 netdev_unregister_kobject(dev);
5883 #ifdef CONFIG_XPS
5884                 /* Remove XPS queueing entries */
5885                 netif_reset_xps_queues_gt(dev, 0);
5886 #endif
5887         }
5888
5889         synchronize_net();
5890
5891         list_for_each_entry(dev, head, unreg_list)
5892                 dev_put(dev);
5893 }
5894
5895 static void rollback_registered(struct net_device *dev)
5896 {
5897         LIST_HEAD(single);
5898
5899         list_add(&dev->unreg_list, &single);
5900         rollback_registered_many(&single);
5901         list_del(&single);
5902 }
5903
5904 static netdev_features_t netdev_fix_features(struct net_device *dev,
5905         netdev_features_t features)
5906 {
5907         /* Fix illegal checksum combinations */
5908         if ((features & NETIF_F_HW_CSUM) &&
5909             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5910                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5911                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5912         }
5913
5914         /* TSO requires that SG is present as well. */
5915         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5916                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5917                 features &= ~NETIF_F_ALL_TSO;
5918         }
5919
5920         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5921                                         !(features & NETIF_F_IP_CSUM)) {
5922                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5923                 features &= ~NETIF_F_TSO;
5924                 features &= ~NETIF_F_TSO_ECN;
5925         }
5926
5927         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5928                                          !(features & NETIF_F_IPV6_CSUM)) {
5929                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5930                 features &= ~NETIF_F_TSO6;
5931         }
5932
5933         /* TSO ECN requires that TSO is present as well. */
5934         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5935                 features &= ~NETIF_F_TSO_ECN;
5936
5937         /* Software GSO depends on SG. */
5938         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5939                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5940                 features &= ~NETIF_F_GSO;
5941         }
5942
5943         /* UFO needs SG and checksumming */
5944         if (features & NETIF_F_UFO) {
5945                 /* maybe split UFO into V4 and V6? */
5946                 if (!((features & NETIF_F_GEN_CSUM) ||
5947                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5948                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5949                         netdev_dbg(dev,
5950                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5951                         features &= ~NETIF_F_UFO;
5952                 }
5953
5954                 if (!(features & NETIF_F_SG)) {
5955                         netdev_dbg(dev,
5956                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5957                         features &= ~NETIF_F_UFO;
5958                 }
5959         }
5960
5961 #ifdef CONFIG_NET_RX_BUSY_POLL
5962         if (dev->netdev_ops->ndo_busy_poll)
5963                 features |= NETIF_F_BUSY_POLL;
5964         else
5965 #endif
5966                 features &= ~NETIF_F_BUSY_POLL;
5967
5968         return features;
5969 }
5970
5971 int __netdev_update_features(struct net_device *dev)
5972 {
5973         netdev_features_t features;
5974         int err = 0;
5975
5976         ASSERT_RTNL();
5977
5978         features = netdev_get_wanted_features(dev);
5979
5980         if (dev->netdev_ops->ndo_fix_features)
5981                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5982
5983         /* driver might be less strict about feature dependencies */
5984         features = netdev_fix_features(dev, features);
5985
5986         if (dev->features == features)
5987                 return 0;
5988
5989         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5990                 &dev->features, &features);
5991
5992         if (dev->netdev_ops->ndo_set_features)
5993                 err = dev->netdev_ops->ndo_set_features(dev, features);
5994
5995         if (unlikely(err < 0)) {
5996                 netdev_err(dev,
5997                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5998                         err, &features, &dev->features);
5999                 return -1;
6000         }
6001
6002         if (!err)
6003                 dev->features = features;
6004
6005         return 1;
6006 }
6007
6008 /**
6009  *      netdev_update_features - recalculate device features
6010  *      @dev: the device to check
6011  *
6012  *      Recalculate dev->features set and send notifications if it
6013  *      has changed. Should be called after driver or hardware dependent
6014  *      conditions might have changed that influence the features.
6015  */
6016 void netdev_update_features(struct net_device *dev)
6017 {
6018         if (__netdev_update_features(dev))
6019                 netdev_features_change(dev);
6020 }
6021 EXPORT_SYMBOL(netdev_update_features);
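
/* Editor's note: a hedged example of when a driver calls
 * netdev_update_features(): after something consulted by its
 * ndo_fix_features() callback changes (here an assumed private flag),
 * re-run the fixup and let the core notify only if the final feature set
 * really changed. struct example_priv and the helper are hypothetical.
 *
 *	struct example_priv {
 *		bool offloads_enabled;
 *	};
 *
 *	static void example_toggle_offloads(struct net_device *dev, bool on)
 *	{
 *		struct example_priv *priv = netdev_priv(dev);
 *
 *		ASSERT_RTNL();
 *		priv->offloads_enabled = on;
 *		netdev_update_features(dev);
 *	}
 */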
6022
6023 /**
6024  *      netdev_change_features - recalculate device features
6025  *      @dev: the device to check
6026  *
6027  *      Recalculate dev->features set and send notifications even
6028  *      if they have not changed. Should be called instead of
6029  *      netdev_update_features() if also dev->vlan_features might
6030  *      have changed to allow the changes to be propagated to stacked
6031  *      VLAN devices.
6032  */
6033 void netdev_change_features(struct net_device *dev)
6034 {
6035         __netdev_update_features(dev);
6036         netdev_features_change(dev);
6037 }
6038 EXPORT_SYMBOL(netdev_change_features);
6039
6040 /**
6041  *      netif_stacked_transfer_operstate -      transfer operstate
6042  *      @rootdev: the root or lower level device to transfer state from
6043  *      @dev: the device to transfer operstate to
6044  *
6045  *      Transfer operational state from root to device. This is normally
6046  *      called when a stacking relationship exists between the root
6047  *      device and the device (a leaf device).
6048  */
6049 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6050                                         struct net_device *dev)
6051 {
6052         if (rootdev->operstate == IF_OPER_DORMANT)
6053                 netif_dormant_on(dev);
6054         else
6055                 netif_dormant_off(dev);
6056
6057         if (netif_carrier_ok(rootdev)) {
6058                 if (!netif_carrier_ok(dev))
6059                         netif_carrier_on(dev);
6060         } else {
6061                 if (netif_carrier_ok(dev))
6062                         netif_carrier_off(dev);
6063         }
6064 }
6065 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
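
/* Editor's note: an illustrative use of netif_stacked_transfer_operstate()
 * from a stacked driver's notifier handling: when the lower (real) device
 * changes state, mirror carrier and dormancy onto the virtual device built
 * on top of it. The helper name and event filtering are assumptions.
 *
 *	static int example_lower_changed(struct net_device *lower,
 *					 struct net_device *stacked,
 *					 unsigned long event)
 *	{
 *		if (event == NETDEV_UP || event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, stacked);
 *		return NOTIFY_DONE;
 *	}
 */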
6066
6067 #ifdef CONFIG_SYSFS
6068 static int netif_alloc_rx_queues(struct net_device *dev)
6069 {
6070         unsigned int i, count = dev->num_rx_queues;
6071         struct netdev_rx_queue *rx;
6072
6073         BUG_ON(count < 1);
6074
6075         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6076         if (!rx)
6077                 return -ENOMEM;
6078
6079         dev->_rx = rx;
6080
6081         for (i = 0; i < count; i++)
6082                 rx[i].dev = dev;
6083         return 0;
6084 }
6085 #endif
6086
6087 static void netdev_init_one_queue(struct net_device *dev,
6088                                   struct netdev_queue *queue, void *_unused)
6089 {
6090         /* Initialize queue lock */
6091         spin_lock_init(&queue->_xmit_lock);
6092         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6093         queue->xmit_lock_owner = -1;
6094         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6095         queue->dev = dev;
6096 #ifdef CONFIG_BQL
6097         dql_init(&queue->dql, HZ);
6098 #endif
6099 }
6100
6101 static void netif_free_tx_queues(struct net_device *dev)
6102 {
6103         kvfree(dev->_tx);
6104 }
6105
6106 static int netif_alloc_netdev_queues(struct net_device *dev)
6107 {
6108         unsigned int count = dev->num_tx_queues;
6109         struct netdev_queue *tx;
6110         size_t sz = count * sizeof(*tx);
6111
6112         BUG_ON(count < 1 || count > 0xffff);
6113
6114         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6115         if (!tx) {
6116                 tx = vzalloc(sz);
6117                 if (!tx)
6118                         return -ENOMEM;
6119         }
6120         dev->_tx = tx;
6121
6122         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6123         spin_lock_init(&dev->tx_global_lock);
6124
6125         return 0;
6126 }
6127
6128 /**
6129  *      register_netdevice      - register a network device
6130  *      @dev: device to register
6131  *
6132  *      Take a completed network device structure and add it to the kernel
6133  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6134  *      chain. 0 is returned on success. A negative errno code is returned
6135  *      on a failure to set up the device, or if the name is a duplicate.
6136  *
6137  *      Callers must hold the rtnl semaphore. You may want
6138  *      register_netdev() instead of this.
6139  *
6140  *      BUGS:
6141  *      The locking appears insufficient to guarantee two parallel registers
6142  *      will not get the same name.
6143  */
6144
6145 int register_netdevice(struct net_device *dev)
6146 {
6147         int ret;
6148         struct net *net = dev_net(dev);
6149
6150         BUG_ON(dev_boot_phase);
6151         ASSERT_RTNL();
6152
6153         might_sleep();
6154
6155         /* When net_devices are persistent, this will be fatal. */
6156         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6157         BUG_ON(!net);
6158
6159         spin_lock_init(&dev->addr_list_lock);
6160         netdev_set_addr_lockdep_class(dev);
6161
6162         dev->iflink = -1;
6163
6164         ret = dev_get_valid_name(net, dev, dev->name);
6165         if (ret < 0)
6166                 goto out;
6167
6168         /* Init, if this function is available */
6169         if (dev->netdev_ops->ndo_init) {
6170                 ret = dev->netdev_ops->ndo_init(dev);
6171                 if (ret) {
6172                         if (ret > 0)
6173                                 ret = -EIO;
6174                         goto out;
6175                 }
6176         }
6177
6178         if (((dev->hw_features | dev->features) &
6179              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6180             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6181              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6182                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6183                 ret = -EINVAL;
6184                 goto err_uninit;
6185         }
6186
6187         ret = -EBUSY;
6188         if (!dev->ifindex)
6189                 dev->ifindex = dev_new_index(net);
6190         else if (__dev_get_by_index(net, dev->ifindex))
6191                 goto err_uninit;
6192
6193         if (dev->iflink == -1)
6194                 dev->iflink = dev->ifindex;
6195
6196         /* Transfer changeable features to wanted_features and enable
6197          * software offloads (GSO and GRO).
6198          */
6199         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6200         dev->features |= NETIF_F_SOFT_FEATURES;
6201         dev->wanted_features = dev->features & dev->hw_features;
6202
6203         if (!(dev->flags & IFF_LOOPBACK)) {
6204                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6205         }
6206
6207         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6208          */
6209         dev->vlan_features |= NETIF_F_HIGHDMA;
6210
6211         /* Make NETIF_F_SG inheritable to tunnel devices.
6212          */
6213         dev->hw_enc_features |= NETIF_F_SG;
6214
6215         /* Make NETIF_F_SG inheritable to MPLS.
6216          */
6217         dev->mpls_features |= NETIF_F_SG;
6218
6219         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6220         ret = notifier_to_errno(ret);
6221         if (ret)
6222                 goto err_uninit;
6223
6224         ret = netdev_register_kobject(dev);
6225         if (ret)
6226                 goto err_uninit;
6227         dev->reg_state = NETREG_REGISTERED;
6228
6229         __netdev_update_features(dev);
6230
6231         /*
6232          *      Default initial state at registration is that the
6233          *      device is present.
6234          */
6235
6236         set_bit(__LINK_STATE_PRESENT, &dev->state);
6237
6238         linkwatch_init_dev(dev);
6239
6240         dev_init_scheduler(dev);
6241         dev_hold(dev);
6242         list_netdevice(dev);
6243         add_device_randomness(dev->dev_addr, dev->addr_len);
6244
6245         /* If the device has a permanent hardware address, the driver
6246          * should set dev_addr, and addr_assign_type should be set to
6247          * NET_ADDR_PERM (the default value).
6248          */
6249         if (dev->addr_assign_type == NET_ADDR_PERM)
6250                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6251
6252         /* Notify protocols that a new device appeared. */
6253         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6254         ret = notifier_to_errno(ret);
6255         if (ret) {
6256                 rollback_registered(dev);
6257                 dev->reg_state = NETREG_UNREGISTERED;
6258         }
6259         /*
6260          *      Prevent userspace races by waiting until the network
6261          *      device is fully set up before sending notifications.
6262          */
6263         if (!dev->rtnl_link_ops ||
6264             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6265                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6266
6267 out:
6268         return ret;
6269
6270 err_uninit:
6271         if (dev->netdev_ops->ndo_uninit)
6272                 dev->netdev_ops->ndo_uninit(dev);
6273         goto out;
6274 }
6275 EXPORT_SYMBOL(register_netdevice);
6276
6277 /**
6278  *      init_dummy_netdev       - init a dummy network device for NAPI
6279  *      @dev: device to init
6280  *
6281  *      This takes a network device structure and initializes the minimum
6282  *      set of fields so it can be used to schedule NAPI polls without
6283  *      registering a full-blown interface. This is to be used by drivers
6284  *      that need to tie several hardware interfaces to a single NAPI
6285  *      poll scheduler due to HW limitations.
6286  */
6287 int init_dummy_netdev(struct net_device *dev)
6288 {
6289         /* Clear everything. Note we don't initialize spinlocks
6290          * as they aren't supposed to be taken by any of the
6291          * NAPI code and this dummy netdev is supposed to be
6292          * only ever used for NAPI polls.
6293          */
6294         memset(dev, 0, sizeof(struct net_device));
6295
6296         /* make sure we BUG if trying to hit standard
6297          * register/unregister code path
6298          */
6299         dev->reg_state = NETREG_DUMMY;
6300
6301         /* NAPI wants this */
6302         INIT_LIST_HEAD(&dev->napi_list);
6303
6304         /* a dummy interface is started by default */
6305         set_bit(__LINK_STATE_PRESENT, &dev->state);
6306         set_bit(__LINK_STATE_START, &dev->state);
6307
6308         /* Note: We don't allocate pcpu_refcnt for dummy devices,
6309          * because users of this 'device' don't need to change
6310          * its refcount.
6311          */
6312
6313         return 0;
6314 }
6315 EXPORT_SYMBOL_GPL(init_dummy_netdev);
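
/* Usage sketch (illustrative): a driver that funnels several hardware
 * channels into one NAPI context can embed a dummy netdev purely as a
 * NAPI anchor. foo_adapter and foo_poll are placeholder names:
 *
 *	struct foo_adapter {
 *		struct net_device napi_dev;
 *		struct napi_struct napi;
 *	};
 *
 *	init_dummy_netdev(&adapter->napi_dev);
 *	netif_napi_add(&adapter->napi_dev, &adapter->napi, foo_poll,
 *		       NAPI_POLL_WEIGHT);
 *	napi_enable(&adapter->napi);
 */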
6316
6317
6318 /**
6319  *      register_netdev - register a network device
6320  *      @dev: device to register
6321  *
6322  *      Take a completed network device structure and add it to the kernel
6323  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6324  *      chain. 0 is returned on success. A negative errno code is returned
6325  *      on a failure to set up the device, or if the name is a duplicate.
6326  *
6327  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
6328  *      and expands the device name if you passed a format string to
6329  *      alloc_netdev.
6330  */
6331 int register_netdev(struct net_device *dev)
6332 {
6333         int err;
6334
6335         rtnl_lock();
6336         err = register_netdevice(dev);
6337         rtnl_unlock();
6338         return err;
6339 }
6340 EXPORT_SYMBOL(register_netdev);
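
/* Usage sketch (illustrative): the usual probe-time pattern for an
 * Ethernet driver. foo_priv and foo_netdev_ops are placeholder names:
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */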
6341
6342 int netdev_refcnt_read(const struct net_device *dev)
6343 {
6344         int i, refcnt = 0;
6345
6346         for_each_possible_cpu(i)
6347                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6348         return refcnt;
6349 }
6350 EXPORT_SYMBOL(netdev_refcnt_read);
6351
6352 /**
6353  * netdev_wait_allrefs - wait until all references are gone.
6354  * @dev: target net_device
6355  *
6356  * This is called when unregistering network devices.
6357  *
6358  * Any protocol or device that holds a reference should register
6359  * for netdevice notifications, and clean up and put back the
6360  * reference when it receives an UNREGISTER event.
6361  * We can get stuck here if buggy protocols don't correctly
6362  * call dev_put.
6363  */
6364 static void netdev_wait_allrefs(struct net_device *dev)
6365 {
6366         unsigned long rebroadcast_time, warning_time;
6367         int refcnt;
6368
6369         linkwatch_forget_dev(dev);
6370
6371         rebroadcast_time = warning_time = jiffies;
6372         refcnt = netdev_refcnt_read(dev);
6373
6374         while (refcnt != 0) {
6375                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6376                         rtnl_lock();
6377
6378                         /* Rebroadcast unregister notification */
6379                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6380
6381                         __rtnl_unlock();
6382                         rcu_barrier();
6383                         rtnl_lock();
6384
6385                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6386                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6387                                      &dev->state)) {
6388                                 /* We must not have linkwatch events
6389                                  * pending on unregister. If this
6390                                  * happens, we simply run the queue
6391                                  * unscheduled, resulting in a noop
6392                                  * for this device.
6393                                  */
6394                                 linkwatch_run_queue();
6395                         }
6396
6397                         __rtnl_unlock();
6398
6399                         rebroadcast_time = jiffies;
6400                 }
6401
6402                 msleep(250);
6403
6404                 refcnt = netdev_refcnt_read(dev);
6405
6406                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6407                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6408                                  dev->name, refcnt);
6409                         warning_time = jiffies;
6410                 }
6411         }
6412 }
6413
6414 /* The sequence is:
6415  *
6416  *      rtnl_lock();
6417  *      ...
6418  *      register_netdevice(x1);
6419  *      register_netdevice(x2);
6420  *      ...
6421  *      unregister_netdevice(y1);
6422  *      unregister_netdevice(y2);
6423  *      ...
6424  *      rtnl_unlock();
6425  *      free_netdev(y1);
6426  *      free_netdev(y2);
6427  *
6428  * We are invoked by rtnl_unlock().
6429  * This allows us to deal with problems:
6430  * 1) We can delete sysfs objects which invoke hotplug
6431  *    without deadlocking with linkwatch via keventd.
6432  * 2) Since we run with the RTNL semaphore not held, we can sleep
6433  *    safely in order to wait for the netdev refcnt to drop to zero.
6434  *
6435  * We must not return until all unregister events added during
6436  * the interval the lock was held have been completed.
6437  */
6438 void netdev_run_todo(void)
6439 {
6440         struct list_head list;
6441
6442         /* Snapshot list, allow later requests */
6443         list_replace_init(&net_todo_list, &list);
6444
6445         __rtnl_unlock();
6446
6447
6448         /* Wait for rcu callbacks to finish before next phase */
6449         if (!list_empty(&list))
6450                 rcu_barrier();
6451
6452         while (!list_empty(&list)) {
6453                 struct net_device *dev
6454                         = list_first_entry(&list, struct net_device, todo_list);
6455                 list_del(&dev->todo_list);
6456
6457                 rtnl_lock();
6458                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6459                 __rtnl_unlock();
6460
6461                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6462                         pr_err("network todo '%s' but state %d\n",
6463                                dev->name, dev->reg_state);
6464                         dump_stack();
6465                         continue;
6466                 }
6467
6468                 dev->reg_state = NETREG_UNREGISTERED;
6469
6470                 on_each_cpu(flush_backlog, dev, 1);
6471
6472                 netdev_wait_allrefs(dev);
6473
6474                 /* paranoia */
6475                 BUG_ON(netdev_refcnt_read(dev));
6476                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6477                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6478                 WARN_ON(dev->dn_ptr);
6479
6480                 if (dev->destructor)
6481                         dev->destructor(dev);
6482
6483                 /* Report a network device has been unregistered */
6484                 rtnl_lock();
6485                 dev_net(dev)->dev_unreg_count--;
6486                 __rtnl_unlock();
6487                 wake_up(&netdev_unregistering_wq);
6488
6489                 /* Free network device */
6490                 kobject_put(&dev->dev.kobj);
6491         }
6492 }
6493
6494 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6495  * fields in the same order, with only the type differing.
6496  */
6497 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6498                              const struct net_device_stats *netdev_stats)
6499 {
6500 #if BITS_PER_LONG == 64
6501         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6502         memcpy(stats64, netdev_stats, sizeof(*stats64));
6503 #else
6504         size_t i, n = sizeof(*stats64) / sizeof(u64);
6505         const unsigned long *src = (const unsigned long *)netdev_stats;
6506         u64 *dst = (u64 *)stats64;
6507
6508         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6509                      sizeof(*stats64) / sizeof(u64));
6510         for (i = 0; i < n; i++)
6511                 dst[i] = src[i];
6512 #endif
6513 }
6514 EXPORT_SYMBOL(netdev_stats_to_stats64);
6515
6516 /**
6517  *      dev_get_stats   - get network device statistics
6518  *      @dev: device to get statistics from
6519  *      @storage: place to store stats
6520  *
6521  *      Get network statistics from device. Return @storage.
6522  *      The device driver may provide its own method by setting
6523  *      dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6524  *      otherwise the internal statistics structure is used.
6525  */
6526 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6527                                         struct rtnl_link_stats64 *storage)
6528 {
6529         const struct net_device_ops *ops = dev->netdev_ops;
6530
6531         if (ops->ndo_get_stats64) {
6532                 memset(storage, 0, sizeof(*storage));
6533                 ops->ndo_get_stats64(dev, storage);
6534         } else if (ops->ndo_get_stats) {
6535                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6536         } else {
6537                 netdev_stats_to_stats64(storage, &dev->stats);
6538         }
6539         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6540         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6541         return storage;
6542 }
6543 EXPORT_SYMBOL(dev_get_stats);
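
/* Usage sketch (illustrative): callers normally pass a buffer on the
 * stack and read the returned pointer:
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 *
 *	... use stats->rx_packets, stats->tx_bytes, ...
 */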
6544
6545 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6546 {
6547         struct netdev_queue *queue = dev_ingress_queue(dev);
6548
6549 #ifdef CONFIG_NET_CLS_ACT
6550         if (queue)
6551                 return queue;
6552         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6553         if (!queue)
6554                 return NULL;
6555         netdev_init_one_queue(dev, queue, NULL);
6556         queue->qdisc = &noop_qdisc;
6557         queue->qdisc_sleeping = &noop_qdisc;
6558         rcu_assign_pointer(dev->ingress_queue, queue);
6559 #endif
6560         return queue;
6561 }
6562
6563 static const struct ethtool_ops default_ethtool_ops;
6564
6565 void netdev_set_default_ethtool_ops(struct net_device *dev,
6566                                     const struct ethtool_ops *ops)
6567 {
6568         if (dev->ethtool_ops == &default_ethtool_ops)
6569                 dev->ethtool_ops = ops;
6570 }
6571 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6572
6573 void netdev_freemem(struct net_device *dev)
6574 {
6575         char *addr = (char *)dev - dev->padded;
6576
6577         kvfree(addr);
6578 }
6579
6580 /**
6581  *      alloc_netdev_mqs - allocate network device
6582  *      @sizeof_priv:           size of private data to allocate space for
6583  *      @name:                  device name format string
6584  *      @name_assign_type:      origin of device name
6585  *      @setup:                 callback to initialize device
6586  *      @txqs:                  the number of TX subqueues to allocate
6587  *      @rxqs:                  the number of RX subqueues to allocate
6588  *
6589  *      Allocates a struct net_device with private data area for driver use
6590  *      and performs basic initialization.  Also allocates subqueue structs
6591  *      for each queue on the device.
6592  */
6593 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6594                 unsigned char name_assign_type,
6595                 void (*setup)(struct net_device *),
6596                 unsigned int txqs, unsigned int rxqs)
6597 {
6598         struct net_device *dev;
6599         size_t alloc_size;
6600         struct net_device *p;
6601
6602         BUG_ON(strlen(name) >= sizeof(dev->name));
6603
6604         if (txqs < 1) {
6605                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6606                 return NULL;
6607         }
6608
6609 #ifdef CONFIG_SYSFS
6610         if (rxqs < 1) {
6611                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6612                 return NULL;
6613         }
6614 #endif
6615
6616         alloc_size = sizeof(struct net_device);
6617         if (sizeof_priv) {
6618                 /* ensure 32-byte alignment of private area */
6619                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6620                 alloc_size += sizeof_priv;
6621         }
6622         /* ensure 32-byte alignment of whole construct */
6623         alloc_size += NETDEV_ALIGN - 1;
6624
6625         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6626         if (!p)
6627                 p = vzalloc(alloc_size);
6628         if (!p)
6629                 return NULL;
6630
6631         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6632         dev->padded = (char *)dev - (char *)p;
6633
6634         dev->pcpu_refcnt = alloc_percpu(int);
6635         if (!dev->pcpu_refcnt)
6636                 goto free_dev;
6637
6638         if (dev_addr_init(dev))
6639                 goto free_pcpu;
6640
6641         dev_mc_init(dev);
6642         dev_uc_init(dev);
6643
6644         dev_net_set(dev, &init_net);
6645
6646         dev->gso_max_size = GSO_MAX_SIZE;
6647         dev->gso_max_segs = GSO_MAX_SEGS;
6648
6649         INIT_LIST_HEAD(&dev->napi_list);
6650         INIT_LIST_HEAD(&dev->unreg_list);
6651         INIT_LIST_HEAD(&dev->close_list);
6652         INIT_LIST_HEAD(&dev->link_watch_list);
6653         INIT_LIST_HEAD(&dev->adj_list.upper);
6654         INIT_LIST_HEAD(&dev->adj_list.lower);
6655         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6656         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6657         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6658         setup(dev);
6659
6660         dev->num_tx_queues = txqs;
6661         dev->real_num_tx_queues = txqs;
6662         if (netif_alloc_netdev_queues(dev))
6663                 goto free_all;
6664
6665 #ifdef CONFIG_SYSFS
6666         dev->num_rx_queues = rxqs;
6667         dev->real_num_rx_queues = rxqs;
6668         if (netif_alloc_rx_queues(dev))
6669                 goto free_all;
6670 #endif
6671
6672         strcpy(dev->name, name);
6673         dev->name_assign_type = name_assign_type;
6674         dev->group = INIT_NETDEV_GROUP;
6675         if (!dev->ethtool_ops)
6676                 dev->ethtool_ops = &default_ethtool_ops;
6677         return dev;
6678
6679 free_all:
6680         free_netdev(dev);
6681         return NULL;
6682
6683 free_pcpu:
6684         free_percpu(dev->pcpu_refcnt);
6685 free_dev:
6686         netdev_freemem(dev);
6687         return NULL;
6688 }
6689 EXPORT_SYMBOL(alloc_netdev_mqs);
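
/* Usage sketch (illustrative): most drivers reach this function through
 * the alloc_netdev()/alloc_etherdev_mq() wrappers. A direct multiqueue
 * allocation could look like this, assuming 4 TX and 4 RX queues and a
 * placeholder foo_priv private struct:
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 */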
6690
6691 /**
6692  *      free_netdev - free network device
6693  *      @dev: device
6694  *
6695  *      This function does the last stage of destroying an allocated device
6696  *      interface. The reference to the device object is released.
6697  *      If this is the last reference then it will be freed.
6698  */
6699 void free_netdev(struct net_device *dev)
6700 {
6701         struct napi_struct *p, *n;
6702
6703         release_net(dev_net(dev));
6704
6705         netif_free_tx_queues(dev);
6706 #ifdef CONFIG_SYSFS
6707         kfree(dev->_rx);
6708 #endif
6709
6710         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6711
6712         /* Flush device addresses */
6713         dev_addr_flush(dev);
6714
6715         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6716                 netif_napi_del(p);
6717
6718         free_percpu(dev->pcpu_refcnt);
6719         dev->pcpu_refcnt = NULL;
6720
6721         /*  Compatibility with error handling in drivers */
6722         if (dev->reg_state == NETREG_UNINITIALIZED) {
6723                 netdev_freemem(dev);
6724                 return;
6725         }
6726
6727         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6728         dev->reg_state = NETREG_RELEASED;
6729
6730         /* will free via device release */
6731         put_device(&dev->dev);
6732 }
6733 EXPORT_SYMBOL(free_netdev);
6734
6735 /**
6736  *      synchronize_net -  Synchronize with packet receive processing
6737  *
6738  *      Wait for packets currently being received to be done.
6739  *      Does not block later packets from starting.
6740  */
6741 void synchronize_net(void)
6742 {
6743         might_sleep();
6744         if (rtnl_is_locked())
6745                 synchronize_rcu_expedited();
6746         else
6747                 synchronize_rcu();
6748 }
6749 EXPORT_SYMBOL(synchronize_net);
6750
6751 /**
6752  *      unregister_netdevice_queue - remove device from the kernel
6753  *      @dev: device
6754  *      @head: list
6755  *
6756  *      This function shuts down a device interface and removes it
6757  *      from the kernel tables.
6758  *      If head is not NULL, the device is queued to be unregistered later.
6759  *
6760  *      Callers must hold the rtnl semaphore.  You may want
6761  *      unregister_netdev() instead of this.
6762  */
6763
6764 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6765 {
6766         ASSERT_RTNL();
6767
6768         if (head) {
6769                 list_move_tail(&dev->unreg_list, head);
6770         } else {
6771                 rollback_registered(dev);
6772                 /* Finish processing unregister after unlock */
6773                 net_set_todo(dev);
6774         }
6775 }
6776 EXPORT_SYMBOL(unregister_netdevice_queue);
6777
6778 /**
6779  *      unregister_netdevice_many - unregister many devices
6780  *      @head: list of devices
6781  *
6782  *  Note: As most callers use a stack-allocated list_head,
6783  *  we force a list_del() to make sure the stack won't be corrupted later.
6784  */
6785 void unregister_netdevice_many(struct list_head *head)
6786 {
6787         struct net_device *dev;
6788
6789         if (!list_empty(head)) {
6790                 rollback_registered_many(head);
6791                 list_for_each_entry(dev, head, unreg_list)
6792                         net_set_todo(dev);
6793                 list_del(head);
6794         }
6795 }
6796 EXPORT_SYMBOL(unregister_netdevice_many);
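
/* Usage sketch (illustrative): batching several unregistrations under a
 * single RTNL section amortizes the expensive synchronization. dev1 and
 * dev2 are placeholders for devices the caller already holds:
 *
 *	LIST_HEAD(kill_list);
 *
 *	ASSERT_RTNL();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 */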
6797
6798 /**
6799  *      unregister_netdev - remove device from the kernel
6800  *      @dev: device
6801  *
6802  *      This function shuts down a device interface and removes it
6803  *      from the kernel tables.
6804  *
6805  *      This is just a wrapper for unregister_netdevice that takes
6806  *      the rtnl semaphore.  In general you want to use this and not
6807  *      unregister_netdevice.
6808  */
6809 void unregister_netdev(struct net_device *dev)
6810 {
6811         rtnl_lock();
6812         unregister_netdevice(dev);
6813         rtnl_unlock();
6814 }
6815 EXPORT_SYMBOL(unregister_netdev);
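
/* Usage sketch (illustrative): the teardown mirror of the register_netdev()
 * pattern above, e.g. from a driver's remove path:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */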
6816
6817 /**
6818  *      dev_change_net_namespace - move device to a different network namespace
6819  *      @dev: device
6820  *      @net: network namespace
6821  *      @pat: If not NULL name pattern to try if the current device name
6822  *            is already taken in the destination network namespace.
6823  *
6824  *      This function shuts down a device interface and moves it
6825  *      to a new network namespace. On success 0 is returned, on
6826  *      a failure a negative errno code is returned.
6827  *
6828  *      Callers must hold the rtnl semaphore.
6829  */
6830
6831 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6832 {
6833         int err;
6834
6835         ASSERT_RTNL();
6836
6837         /* Don't allow namespace local devices to be moved. */
6838         err = -EINVAL;
6839         if (dev->features & NETIF_F_NETNS_LOCAL)
6840                 goto out;
6841
6842         /* Ensure the device has been registered */
6843         if (dev->reg_state != NETREG_REGISTERED)
6844                 goto out;
6845
6846         /* Get out if there is nothing to do */
6847         err = 0;
6848         if (net_eq(dev_net(dev), net))
6849                 goto out;
6850
6851         /* Pick the destination device name, and ensure
6852          * we can use it in the destination network namespace.
6853          */
6854         err = -EEXIST;
6855         if (__dev_get_by_name(net, dev->name)) {
6856                 /* We get here if we can't use the current device name */
6857                 if (!pat)
6858                         goto out;
6859                 if (dev_get_valid_name(net, dev, pat) < 0)
6860                         goto out;
6861         }
6862
6863         /*
6864          * And now a mini version of register_netdevice and unregister_netdevice.
6865          */
6866
6867         /* If device is running close it first. */
6868         dev_close(dev);
6869
6870         /* And unlink it from device chain */
6871         err = -ENODEV;
6872         unlist_netdevice(dev);
6873
6874         synchronize_net();
6875
6876         /* Shutdown queueing discipline. */
6877         dev_shutdown(dev);
6878
6879         /* Notify protocols that we are about to destroy
6880            this device. They should clean up all of their state.
6881
6882            Note that dev->reg_state stays at NETREG_REGISTERED.
6883            This is intentional: this way 8021q and macvlan know
6884            the device is just moving and can keep their slaves up.
6885         */
6886         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6887         rcu_barrier();
6888         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6889         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6890
6891         /*
6892          *      Flush the unicast and multicast chains
6893          */
6894         dev_uc_flush(dev);
6895         dev_mc_flush(dev);
6896
6897         /* Send a netdev-removed uevent to the old namespace */
6898         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6899         netdev_adjacent_del_links(dev);
6900
6901         /* Actually switch the network namespace */
6902         dev_net_set(dev, net);
6903
6904         /* If there is an ifindex conflict assign a new one */
6905         if (__dev_get_by_index(net, dev->ifindex)) {
6906                 int iflink = (dev->iflink == dev->ifindex);
6907                 dev->ifindex = dev_new_index(net);
6908                 if (iflink)
6909                         dev->iflink = dev->ifindex;
6910         }
6911
6912         /* Send a netdev-add uevent to the new namespace */
6913         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6914         netdev_adjacent_add_links(dev);
6915
6916         /* Fixup kobjects */
6917         err = device_rename(&dev->dev, dev->name);
6918         WARN_ON(err);
6919
6920         /* Add the device back in the hashes */
6921         list_netdevice(dev);
6922
6923         /* Notify protocols, that a new device appeared. */
6924         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6925
6926         /*
6927          *      Prevent userspace races by waiting until the network
6928          *      device is fully set up before sending notifications.
6929          */
6930         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6931
6932         synchronize_net();
6933         err = 0;
6934 out:
6935         return err;
6936 }
6937 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
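
/* Usage sketch (illustrative): moving a device into another namespace,
 * falling back to a pattern name if its current name is already taken
 * there. "net" stands for a struct net the caller holds a reference on:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */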
6938
6939 static int dev_cpu_callback(struct notifier_block *nfb,
6940                             unsigned long action,
6941                             void *ocpu)
6942 {
6943         struct sk_buff **list_skb;
6944         struct sk_buff *skb;
6945         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6946         struct softnet_data *sd, *oldsd;
6947
6948         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6949                 return NOTIFY_OK;
6950
6951         local_irq_disable();
6952         cpu = smp_processor_id();
6953         sd = &per_cpu(softnet_data, cpu);
6954         oldsd = &per_cpu(softnet_data, oldcpu);
6955
6956         /* Find end of our completion_queue. */
6957         list_skb = &sd->completion_queue;
6958         while (*list_skb)
6959                 list_skb = &(*list_skb)->next;
6960         /* Append completion queue from offline CPU. */
6961         *list_skb = oldsd->completion_queue;
6962         oldsd->completion_queue = NULL;
6963
6964         /* Append output queue from offline CPU. */
6965         if (oldsd->output_queue) {
6966                 *sd->output_queue_tailp = oldsd->output_queue;
6967                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6968                 oldsd->output_queue = NULL;
6969                 oldsd->output_queue_tailp = &oldsd->output_queue;
6970         }
6971         /* Append NAPI poll list from offline CPU. */
6972         if (!list_empty(&oldsd->poll_list)) {
6973                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6974                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6975         }
6976
6977         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6978         local_irq_enable();
6979
6980         /* Process offline CPU's input_pkt_queue */
6981         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6982                 netif_rx_internal(skb);
6983                 input_queue_head_incr(oldsd);
6984         }
6985         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6986                 netif_rx_internal(skb);
6987                 input_queue_head_incr(oldsd);
6988         }
6989
6990         return NOTIFY_OK;
6991 }
6992
6993
6994 /**
6995  *      netdev_increment_features - increment feature set by one
6996  *      @all: current feature set
6997  *      @one: new feature set
6998  *      @mask: mask feature set
6999  *
7000  *      Computes a new feature set after adding a device with feature set
7001  *      @one to the master device with current feature set @all.  Will not
7002  *      enable anything that is off in @mask. Returns the new feature set.
7003  */
7004 netdev_features_t netdev_increment_features(netdev_features_t all,
7005         netdev_features_t one, netdev_features_t mask)
7006 {
7007         if (mask & NETIF_F_GEN_CSUM)
7008                 mask |= NETIF_F_ALL_CSUM;
7009         mask |= NETIF_F_VLAN_CHALLENGED;
7010
7011         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7012         all &= one | ~NETIF_F_ALL_FOR_ALL;
7013
7014         /* If one device supports hw checksumming, set for all. */
7015         if (all & NETIF_F_GEN_CSUM)
7016                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7017
7018         return all;
7019 }
7020 EXPORT_SYMBOL(netdev_increment_features);
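
/* Usage sketch (illustrative): an aggregating device (bonding/team style)
 * typically folds the feature sets of its lower devices into its own.
 * "mask" is whatever feature set the aggregate is willing to advertise,
 * and the loop is pseudo-code for the caller's own slave iteration:
 *
 *	netdev_features_t features = mask;
 *
 *	features &= ~NETIF_F_ONE_FOR_ALL;
 *	for each lower device "slave":
 *		features = netdev_increment_features(features,
 *						     slave->features, mask);
 */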
7021
7022 static struct hlist_head * __net_init netdev_create_hash(void)
7023 {
7024         int i;
7025         struct hlist_head *hash;
7026
7027         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7028         if (hash != NULL)
7029                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7030                         INIT_HLIST_HEAD(&hash[i]);
7031
7032         return hash;
7033 }
7034
7035 /* Initialize per network namespace state */
7036 static int __net_init netdev_init(struct net *net)
7037 {
7038         if (net != &init_net)
7039                 INIT_LIST_HEAD(&net->dev_base_head);
7040
7041         net->dev_name_head = netdev_create_hash();
7042         if (net->dev_name_head == NULL)
7043                 goto err_name;
7044
7045         net->dev_index_head = netdev_create_hash();
7046         if (net->dev_index_head == NULL)
7047                 goto err_idx;
7048
7049         return 0;
7050
7051 err_idx:
7052         kfree(net->dev_name_head);
7053 err_name:
7054         return -ENOMEM;
7055 }
7056
7057 /**
7058  *      netdev_drivername - network driver for the device
7059  *      @dev: network device
7060  *
7061  *      Determine network driver for device.
7062  */
7063 const char *netdev_drivername(const struct net_device *dev)
7064 {
7065         const struct device_driver *driver;
7066         const struct device *parent;
7067         const char *empty = "";
7068
7069         parent = dev->dev.parent;
7070         if (!parent)
7071                 return empty;
7072
7073         driver = parent->driver;
7074         if (driver && driver->name)
7075                 return driver->name;
7076         return empty;
7077 }
7078
7079 static int __netdev_printk(const char *level, const struct net_device *dev,
7080                            struct va_format *vaf)
7081 {
7082         int r;
7083
7084         if (dev && dev->dev.parent) {
7085                 r = dev_printk_emit(level[1] - '0',
7086                                     dev->dev.parent,
7087                                     "%s %s %s%s: %pV",
7088                                     dev_driver_string(dev->dev.parent),
7089                                     dev_name(dev->dev.parent),
7090                                     netdev_name(dev), netdev_reg_state(dev),
7091                                     vaf);
7092         } else if (dev) {
7093                 r = printk("%s%s%s: %pV", level, netdev_name(dev),
7094                            netdev_reg_state(dev), vaf);
7095         } else {
7096                 r = printk("%s(NULL net_device): %pV", level, vaf);
7097         }
7098
7099         return r;
7100 }
7101
7102 int netdev_printk(const char *level, const struct net_device *dev,
7103                   const char *format, ...)
7104 {
7105         struct va_format vaf;
7106         va_list args;
7107         int r;
7108
7109         va_start(args, format);
7110
7111         vaf.fmt = format;
7112         vaf.va = &args;
7113
7114         r = __netdev_printk(level, dev, &vaf);
7115
7116         va_end(args);
7117
7118         return r;
7119 }
7120 EXPORT_SYMBOL(netdev_printk);
7121
7122 #define define_netdev_printk_level(func, level)                 \
7123 int func(const struct net_device *dev, const char *fmt, ...)    \
7124 {                                                               \
7125         int r;                                                  \
7126         struct va_format vaf;                                   \
7127         va_list args;                                           \
7128                                                                 \
7129         va_start(args, fmt);                                    \
7130                                                                 \
7131         vaf.fmt = fmt;                                          \
7132         vaf.va = &args;                                         \
7133                                                                 \
7134         r = __netdev_printk(level, dev, &vaf);                  \
7135                                                                 \
7136         va_end(args);                                           \
7137                                                                 \
7138         return r;                                               \
7139 }                                                               \
7140 EXPORT_SYMBOL(func);
7141
7142 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7143 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7144 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7145 define_netdev_printk_level(netdev_err, KERN_ERR);
7146 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7147 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7148 define_netdev_printk_level(netdev_info, KERN_INFO);
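
/* Usage sketch (illustrative): these wrappers are the preferred way for
 * drivers to log with device context. speed and duplex are placeholder
 * variables from the calling driver:
 *
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n",
 *		    speed, duplex ? "full" : "half");
 *	netdev_err(dev, "DMA mapping failed\n");
 */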
7149
7150 static void __net_exit netdev_exit(struct net *net)
7151 {
7152         kfree(net->dev_name_head);
7153         kfree(net->dev_index_head);
7154 }
7155
7156 static struct pernet_operations __net_initdata netdev_net_ops = {
7157         .init = netdev_init,
7158         .exit = netdev_exit,
7159 };
7160
7161 static void __net_exit default_device_exit(struct net *net)
7162 {
7163         struct net_device *dev, *aux;
7164         /*
7165          * Push all migratable network devices back to the
7166          * initial network namespace
7167          */
7168         rtnl_lock();
7169         for_each_netdev_safe(net, dev, aux) {
7170                 int err;
7171                 char fb_name[IFNAMSIZ];
7172
7173                 /* Ignore unmovable devices (e.g. loopback) */
7174                 if (dev->features & NETIF_F_NETNS_LOCAL)
7175                         continue;
7176
7177                 /* Leave virtual devices for the generic cleanup */
7178                 if (dev->rtnl_link_ops)
7179                         continue;
7180
7181                 /* Push remaining network devices to init_net */
7182                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7183                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7184                 if (err) {
7185                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7186                                  __func__, dev->name, err);
7187                         BUG();
7188                 }
7189         }
7190         rtnl_unlock();
7191 }
7192
7193 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7194 {
7195         /* Return with the rtnl_lock held when there are no network
7196          * devices unregistering in any network namespace in net_list.
7197          */
7198         struct net *net;
7199         bool unregistering;
7200         DEFINE_WAIT(wait);
7201
7202         for (;;) {
7203                 prepare_to_wait(&netdev_unregistering_wq, &wait,
7204                                 TASK_UNINTERRUPTIBLE);
7205                 unregistering = false;
7206                 rtnl_lock();
7207                 list_for_each_entry(net, net_list, exit_list) {
7208                         if (net->dev_unreg_count > 0) {
7209                                 unregistering = true;
7210                                 break;
7211                         }
7212                 }
7213                 if (!unregistering)
7214                         break;
7215                 __rtnl_unlock();
7216                 schedule();
7217         }
7218         finish_wait(&netdev_unregistering_wq, &wait);
7219 }
7220
7221 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7222 {
7223         /* At exit all network devices must be removed from a network
7224          * namespace.  Do this in the reverse order of registration.
7225          * Do this across as many network namespaces as possible to
7226          * improve batching efficiency.
7227          */
7228         struct net_device *dev;
7229         struct net *net;
7230         LIST_HEAD(dev_kill_list);
7231
7232         /* To prevent network device cleanup code from dereferencing
7233          * loopback devices or network devices that have been freed,
7234          * wait here for all pending unregistrations to complete
7235          * before unregistering the loopback device and allowing the
7236          * network namespace to be freed.
7237          *
7238          * The netdev todo list containing all network device
7239          * unregistrations that happen in default_device_exit_batch
7240          * will run in the rtnl_unlock() at the end of
7241          * default_device_exit_batch.
7242          */
7243         rtnl_lock_unregistering(net_list);
7244         list_for_each_entry(net, net_list, exit_list) {
7245                 for_each_netdev_reverse(net, dev) {
7246                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7247                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7248                         else
7249                                 unregister_netdevice_queue(dev, &dev_kill_list);
7250                 }
7251         }
7252         unregister_netdevice_many(&dev_kill_list);
7253         rtnl_unlock();
7254 }
7255
7256 static struct pernet_operations __net_initdata default_device_ops = {
7257         .exit = default_device_exit,
7258         .exit_batch = default_device_exit_batch,
7259 };
7260
7261 /*
7262  *      Initialize the DEV module. At boot time this walks the device list and
7263  *      unhooks any devices that fail to initialize (normally hardware not
7264  *      present) and leaves us with a valid list of present and active devices.
7265  *
7266  */
7267
7268 /*
7269  *       This is called single threaded during boot, so no need
7270  *       to take the rtnl semaphore.
7271  */
7272 static int __init net_dev_init(void)
7273 {
7274         int i, rc = -ENOMEM;
7275
7276         BUG_ON(!dev_boot_phase);
7277
7278         if (dev_proc_init())
7279                 goto out;
7280
7281         if (netdev_kobject_init())
7282                 goto out;
7283
7284         INIT_LIST_HEAD(&ptype_all);
7285         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7286                 INIT_LIST_HEAD(&ptype_base[i]);
7287
7288         INIT_LIST_HEAD(&offload_base);
7289
7290         if (register_pernet_subsys(&netdev_net_ops))
7291                 goto out;
7292
7293         /*
7294          *      Initialise the packet receive queues.
7295          */
7296
7297         for_each_possible_cpu(i) {
7298                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7299
7300                 skb_queue_head_init(&sd->input_pkt_queue);
7301                 skb_queue_head_init(&sd->process_queue);
7302                 INIT_LIST_HEAD(&sd->poll_list);
7303                 sd->output_queue_tailp = &sd->output_queue;
7304 #ifdef CONFIG_RPS
7305                 sd->csd.func = rps_trigger_softirq;
7306                 sd->csd.info = sd;
7307                 sd->cpu = i;
7308 #endif
7309
7310                 sd->backlog.poll = process_backlog;
7311                 sd->backlog.weight = weight_p;
7312         }
7313
7314         dev_boot_phase = 0;
7315
7316         /* The loopback device is special: if any other network device
7317          * is present in a network namespace, the loopback device must
7318          * be present too. Since we now dynamically allocate and free
7319          * the loopback device, ensure this invariant is maintained by
7320          * keeping the loopback device as the first device on the
7321          * list of network devices, ensuring the loopback device
7322          * is the first device that appears and the last network device
7323          * that disappears.
7324          */
7325         if (register_pernet_device(&loopback_net_ops))
7326                 goto out;
7327
7328         if (register_pernet_device(&default_device_ops))
7329                 goto out;
7330
7331         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7332         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7333
7334         hotcpu_notifier(dev_cpu_callback, 0);
7335         dst_init();
7336         rc = 0;
7337 out:
7338         return rc;
7339 }
7340
7341 subsys_initcall(net_dev_init);