openvswitch: fix odd_ptr_err.cocci warnings
[linux-2.6-block.git] / net / core / dev.c
CommitLineData
1da177e4
LT
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
02c30a84 10 * Authors: Ross Biro
1da177e4
LT
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
1da177e4 76#include <linux/bitops.h>
4fc268d2 77#include <linux/capability.h>
1da177e4
LT
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
08e9897d 81#include <linux/hash.h>
5a0e3ad6 82#include <linux/slab.h>
1da177e4 83#include <linux/sched.h>
4a3e2f71 84#include <linux/mutex.h>
1da177e4
LT
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
0187bdfb 94#include <linux/ethtool.h>
1da177e4
LT
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
457c4cbc 97#include <net/net_namespace.h>
1da177e4
LT
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
1da177e4 100#include <linux/stat.h>
1da177e4
LT
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
44540960 104#include <net/xfrm.h>
1da177e4
LT
105#include <linux/highmem.h>
106#include <linux/init.h>
1da177e4 107#include <linux/module.h>
1da177e4
LT
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
1da177e4 111#include <net/iw_handler.h>
1da177e4 112#include <asm/current.h>
5bdb9886 113#include <linux/audit.h>
db217334 114#include <linux/dmaengine.h>
f6a78bfc 115#include <linux/err.h>
c7fa9d18 116#include <linux/ctype.h>
723e98b7 117#include <linux/if_arp.h>
6de329e2 118#include <linux/if_vlan.h>
8f0f2223 119#include <linux/ip.h>
ad55dcaf 120#include <net/ip.h>
25cd9ba0 121#include <net/mpls.h>
8f0f2223
DM
122#include <linux/ipv6.h>
123#include <linux/in.h>
b6b2fed1
DM
124#include <linux/jhash.h>
125#include <linux/random.h>
9cbc1cb8 126#include <trace/events/napi.h>
cf66ba58 127#include <trace/events/net.h>
07dc22e7 128#include <trace/events/skb.h>
5acbbd42 129#include <linux/pci.h>
caeda9b9 130#include <linux/inetdevice.h>
c445477d 131#include <linux/cpu_rmap.h>
c5905afb 132#include <linux/static_key.h>
af12fa6e 133#include <linux/hashtable.h>
60877a32 134#include <linux/vmalloc.h>
529d0489 135#include <linux/if_macvlan.h>
e7fd2885 136#include <linux/errqueue.h>
3b47d303 137#include <linux/hrtimer.h>
1da177e4 138
342709ef
PE
139#include "net-sysfs.h"
140
d565b0a1
HX
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
5d38a079
HX
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
1da177e4 147static DEFINE_SPINLOCK(ptype_lock);
62532da9 148static DEFINE_SPINLOCK(offload_lock);
900ff8c6
CW
149struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150struct list_head ptype_all __read_mostly; /* Taps */
62532da9 151static struct list_head offload_base __read_mostly;
1da177e4 152
ae78dbfa 153static int netif_rx_internal(struct sk_buff *skb);
54951194
LP
154static int call_netdevice_notifiers_info(unsigned long val,
155 struct net_device *dev,
156 struct netdev_notifier_info *info);
ae78dbfa 157
1da177e4 158/*
7562f876 159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
1da177e4
LT
160 * semaphore.
161 *
c6d14c84 162 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
1da177e4
LT
163 *
164 * Writers must hold the rtnl semaphore while they loop through the
7562f876 165 * dev_base_head list, and hold dev_base_lock for writing when they do the
1da177e4
LT
166 * actual updates. This allows pure readers to access the list even
167 * while a writer is preparing to update it.
168 *
169 * To put it another way, dev_base_lock is held for writing only to
170 * protect against pure readers; the rtnl semaphore provides the
171 * protection against other writers.
172 *
173 * See, for example usages, register_netdevice() and
174 * unregister_netdevice(), which must be called with the rtnl
175 * semaphore held.
176 */
1da177e4 177DEFINE_RWLOCK(dev_base_lock);
1da177e4
LT
178EXPORT_SYMBOL(dev_base_lock);
179
af12fa6e
ET
180/* protects napi_hash addition/deletion and napi_gen_id */
181static DEFINE_SPINLOCK(napi_hash_lock);
182
183static unsigned int napi_gen_id;
184static DEFINE_HASHTABLE(napi_hash, 8);
185
18afa4b0 186static seqcount_t devnet_rename_seq;
c91f6df2 187
4e985ada
TG
188static inline void dev_base_seq_inc(struct net *net)
189{
190 while (++net->dev_base_seq == 0);
191}
192
881d966b 193static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 194{
95c96174
ED
195 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196
08e9897d 197 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
198}
199
881d966b 200static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 201{
7c28bd0b 202 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
203}
204
e36fa2f7 205static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
206{
207#ifdef CONFIG_RPS
e36fa2f7 208 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
209#endif
210}
211
e36fa2f7 212static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
213{
214#ifdef CONFIG_RPS
e36fa2f7 215 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
216#endif
217}
218
ce286d32 219/* Device list insertion */
53759be9 220static void list_netdevice(struct net_device *dev)
ce286d32 221{
c346dca1 222 struct net *net = dev_net(dev);
ce286d32
EB
223
224 ASSERT_RTNL();
225
226 write_lock_bh(&dev_base_lock);
c6d14c84 227 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 228 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
229 hlist_add_head_rcu(&dev->index_hlist,
230 dev_index_hash(net, dev->ifindex));
ce286d32 231 write_unlock_bh(&dev_base_lock);
4e985ada
TG
232
233 dev_base_seq_inc(net);
ce286d32
EB
234}
235
fb699dfd
ED
236/* Device list removal
237 * caller must respect a RCU grace period before freeing/reusing dev
238 */
ce286d32
EB
239static void unlist_netdevice(struct net_device *dev)
240{
241 ASSERT_RTNL();
242
243 /* Unlink dev from the device chain */
244 write_lock_bh(&dev_base_lock);
c6d14c84 245 list_del_rcu(&dev->dev_list);
72c9528b 246 hlist_del_rcu(&dev->name_hlist);
fb699dfd 247 hlist_del_rcu(&dev->index_hlist);
ce286d32 248 write_unlock_bh(&dev_base_lock);
4e985ada
TG
249
250 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
251}
252
1da177e4
LT
253/*
254 * Our notifier list
255 */
256
f07d5b94 257static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
258
259/*
260 * Device drivers call our routines to queue packets here. We empty the
261 * queue in the local softnet handler.
262 */
bea3348e 263
9958da05 264DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 265EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 266
cf508b12 267#ifdef CONFIG_LOCKDEP
723e98b7 268/*
c773e847 269 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
270 * according to dev->type
271 */
272static const unsigned short netdev_lock_type[] =
273 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
285 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 288
36cbd3dc 289static const char *const netdev_lock_name[] =
723e98b7
JP
290 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
302 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
305
306static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 307static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
308
309static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310{
311 int i;
312
313 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314 if (netdev_lock_type[i] == dev_type)
315 return i;
316 /* the last key is used by default */
317 return ARRAY_SIZE(netdev_lock_type) - 1;
318}
319
cf508b12
DM
320static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321 unsigned short dev_type)
723e98b7
JP
322{
323 int i;
324
325 i = netdev_lock_pos(dev_type);
326 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327 netdev_lock_name[i]);
328}
cf508b12
DM
329
330static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331{
332 int i;
333
334 i = netdev_lock_pos(dev->type);
335 lockdep_set_class_and_name(&dev->addr_list_lock,
336 &netdev_addr_lock_key[i],
337 netdev_lock_name[i]);
338}
723e98b7 339#else
cf508b12
DM
340static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341 unsigned short dev_type)
342{
343}
344static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
345{
346}
347#endif
1da177e4
LT
348
349/*******************************************************************************
350
351 Protocol management and registration routines
352
353*******************************************************************************/
354
1da177e4
LT
355/*
356 * Add a protocol ID to the list. Now that the input handler is
357 * smarter we can dispense with all the messy stuff that used to be
358 * here.
359 *
360 * BEWARE!!! Protocol handlers, mangling input packets,
361 * MUST BE last in hash buckets and checking protocol handlers
362 * MUST start from promiscuous ptype_all chain in net_bh.
363 * It is true now, do not change it.
364 * Explanation follows: if protocol handler, mangling packet, will
365 * be the first on list, it is not able to sense, that packet
366 * is cloned and should be copied-on-write, so that it will
367 * change it and subsequent readers will get broken packet.
368 * --ANK (980803)
369 */
370
c07b68e8
ED
371static inline struct list_head *ptype_head(const struct packet_type *pt)
372{
373 if (pt->type == htons(ETH_P_ALL))
374 return &ptype_all;
375 else
376 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377}
378
1da177e4
LT
379/**
380 * dev_add_pack - add packet handler
381 * @pt: packet type declaration
382 *
383 * Add a protocol handler to the networking stack. The passed &packet_type
384 * is linked into kernel lists and may not be freed until it has been
385 * removed from the kernel lists.
386 *
4ec93edb 387 * This call does not sleep therefore it can not
1da177e4
LT
388 * guarantee all CPU's that are in middle of receiving packets
389 * will see the new packet type (until the next received packet).
390 */
391
392void dev_add_pack(struct packet_type *pt)
393{
c07b68e8 394 struct list_head *head = ptype_head(pt);
1da177e4 395
c07b68e8
ED
396 spin_lock(&ptype_lock);
397 list_add_rcu(&pt->list, head);
398 spin_unlock(&ptype_lock);
1da177e4 399}
d1b19dff 400EXPORT_SYMBOL(dev_add_pack);
1da177e4 401
1da177e4
LT
402/**
403 * __dev_remove_pack - remove packet handler
404 * @pt: packet type declaration
405 *
406 * Remove a protocol handler that was previously added to the kernel
407 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
408 * from the kernel lists and can be freed or reused once this function
4ec93edb 409 * returns.
1da177e4
LT
410 *
411 * The packet type might still be in use by receivers
412 * and must not be freed until after all the CPU's have gone
413 * through a quiescent state.
414 */
415void __dev_remove_pack(struct packet_type *pt)
416{
c07b68e8 417 struct list_head *head = ptype_head(pt);
1da177e4
LT
418 struct packet_type *pt1;
419
c07b68e8 420 spin_lock(&ptype_lock);
1da177e4
LT
421
422 list_for_each_entry(pt1, head, list) {
423 if (pt == pt1) {
424 list_del_rcu(&pt->list);
425 goto out;
426 }
427 }
428
7b6cd1ce 429 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 430out:
c07b68e8 431 spin_unlock(&ptype_lock);
1da177e4 432}
d1b19dff
ED
433EXPORT_SYMBOL(__dev_remove_pack);
434
1da177e4
LT
435/**
436 * dev_remove_pack - remove packet handler
437 * @pt: packet type declaration
438 *
439 * Remove a protocol handler that was previously added to the kernel
440 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
441 * from the kernel lists and can be freed or reused once this function
442 * returns.
443 *
444 * This call sleeps to guarantee that no CPU is looking at the packet
445 * type after return.
446 */
447void dev_remove_pack(struct packet_type *pt)
448{
449 __dev_remove_pack(pt);
4ec93edb 450
1da177e4
LT
451 synchronize_net();
452}
d1b19dff 453EXPORT_SYMBOL(dev_remove_pack);
1da177e4 454
62532da9
VY
455
456/**
457 * dev_add_offload - register offload handlers
458 * @po: protocol offload declaration
459 *
460 * Add protocol offload handlers to the networking stack. The passed
461 * &proto_offload is linked into kernel lists and may not be freed until
462 * it has been removed from the kernel lists.
463 *
464 * This call does not sleep therefore it can not
465 * guarantee all CPU's that are in middle of receiving packets
466 * will see the new offload handlers (until the next received packet).
467 */
468void dev_add_offload(struct packet_offload *po)
469{
470 struct list_head *head = &offload_base;
471
472 spin_lock(&offload_lock);
473 list_add_rcu(&po->list, head);
474 spin_unlock(&offload_lock);
475}
476EXPORT_SYMBOL(dev_add_offload);
477
478/**
479 * __dev_remove_offload - remove offload handler
480 * @po: packet offload declaration
481 *
482 * Remove a protocol offload handler that was previously added to the
483 * kernel offload handlers by dev_add_offload(). The passed &offload_type
484 * is removed from the kernel lists and can be freed or reused once this
485 * function returns.
486 *
487 * The packet type might still be in use by receivers
488 * and must not be freed until after all the CPU's have gone
489 * through a quiescent state.
490 */
1d143d9f 491static void __dev_remove_offload(struct packet_offload *po)
62532da9
VY
492{
493 struct list_head *head = &offload_base;
494 struct packet_offload *po1;
495
c53aa505 496 spin_lock(&offload_lock);
62532da9
VY
497
498 list_for_each_entry(po1, head, list) {
499 if (po == po1) {
500 list_del_rcu(&po->list);
501 goto out;
502 }
503 }
504
505 pr_warn("dev_remove_offload: %p not found\n", po);
506out:
c53aa505 507 spin_unlock(&offload_lock);
62532da9 508}
62532da9
VY
509
510/**
511 * dev_remove_offload - remove packet offload handler
512 * @po: packet offload declaration
513 *
514 * Remove a packet offload handler that was previously added to the kernel
515 * offload handlers by dev_add_offload(). The passed &offload_type is
516 * removed from the kernel lists and can be freed or reused once this
517 * function returns.
518 *
519 * This call sleeps to guarantee that no CPU is looking at the packet
520 * type after return.
521 */
522void dev_remove_offload(struct packet_offload *po)
523{
524 __dev_remove_offload(po);
525
526 synchronize_net();
527}
528EXPORT_SYMBOL(dev_remove_offload);
529
1da177e4
LT
530/******************************************************************************
531
532 Device Boot-time Settings Routines
533
534*******************************************************************************/
535
536/* Boot time configuration table */
537static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538
539/**
540 * netdev_boot_setup_add - add new setup entry
541 * @name: name of the device
542 * @map: configured settings for the device
543 *
544 * Adds new setup entry to the dev_boot_setup list. The function
545 * returns 0 on error and 1 on success. This is a generic routine to
546 * all netdevices.
547 */
548static int netdev_boot_setup_add(char *name, struct ifmap *map)
549{
550 struct netdev_boot_setup *s;
551 int i;
552
553 s = dev_boot_setup;
554 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 557 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
558 memcpy(&s[i].map, map, sizeof(s[i].map));
559 break;
560 }
561 }
562
563 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564}
565
566/**
567 * netdev_boot_setup_check - check boot time settings
568 * @dev: the netdevice
569 *
570 * Check boot time settings for the device.
571 * The found settings are set for the device to be used
572 * later in the device probing.
573 * Returns 0 if no settings found, 1 if they are.
574 */
575int netdev_boot_setup_check(struct net_device *dev)
576{
577 struct netdev_boot_setup *s = dev_boot_setup;
578 int i;
579
580 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 582 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
583 dev->irq = s[i].map.irq;
584 dev->base_addr = s[i].map.base_addr;
585 dev->mem_start = s[i].map.mem_start;
586 dev->mem_end = s[i].map.mem_end;
587 return 1;
588 }
589 }
590 return 0;
591}
d1b19dff 592EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
593
594
595/**
596 * netdev_boot_base - get address from boot time settings
597 * @prefix: prefix for network device
598 * @unit: id for network device
599 *
600 * Check boot time settings for the base address of device.
601 * The found settings are set for the device to be used
602 * later in the device probing.
603 * Returns 0 if no settings found.
604 */
605unsigned long netdev_boot_base(const char *prefix, int unit)
606{
607 const struct netdev_boot_setup *s = dev_boot_setup;
608 char name[IFNAMSIZ];
609 int i;
610
611 sprintf(name, "%s%d", prefix, unit);
612
613 /*
614 * If device already registered then return base of 1
615 * to indicate not to probe for this interface
616 */
881d966b 617 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
618 return 1;
619
620 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621 if (!strcmp(name, s[i].name))
622 return s[i].map.base_addr;
623 return 0;
624}
625
626/*
627 * Saves at boot time configured settings for any netdevice.
628 */
629int __init netdev_boot_setup(char *str)
630{
631 int ints[5];
632 struct ifmap map;
633
634 str = get_options(str, ARRAY_SIZE(ints), ints);
635 if (!str || !*str)
636 return 0;
637
638 /* Save settings */
639 memset(&map, 0, sizeof(map));
640 if (ints[0] > 0)
641 map.irq = ints[1];
642 if (ints[0] > 1)
643 map.base_addr = ints[2];
644 if (ints[0] > 2)
645 map.mem_start = ints[3];
646 if (ints[0] > 3)
647 map.mem_end = ints[4];
648
649 /* Add new entry to the list */
650 return netdev_boot_setup_add(str, &map);
651}
652
653__setup("netdev=", netdev_boot_setup);
654
655/*******************************************************************************
656
657 Device Interface Subroutines
658
659*******************************************************************************/
660
661/**
662 * __dev_get_by_name - find a device by its name
c4ea43c5 663 * @net: the applicable net namespace
1da177e4
LT
664 * @name: name to find
665 *
666 * Find an interface by name. Must be called under RTNL semaphore
667 * or @dev_base_lock. If the name is found a pointer to the device
668 * is returned. If the name is not found then %NULL is returned. The
669 * reference counters are not incremented so the caller must be
670 * careful with locks.
671 */
672
881d966b 673struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 674{
0bd8d536
ED
675 struct net_device *dev;
676 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 677
b67bfe0d 678 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
679 if (!strncmp(dev->name, name, IFNAMSIZ))
680 return dev;
0bd8d536 681
1da177e4
LT
682 return NULL;
683}
d1b19dff 684EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 685
72c9528b
ED
686/**
687 * dev_get_by_name_rcu - find a device by its name
688 * @net: the applicable net namespace
689 * @name: name to find
690 *
691 * Find an interface by name.
692 * If the name is found a pointer to the device is returned.
693 * If the name is not found then %NULL is returned.
694 * The reference counters are not incremented so the caller must be
695 * careful with locks. The caller must hold RCU lock.
696 */
697
698struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699{
72c9528b
ED
700 struct net_device *dev;
701 struct hlist_head *head = dev_name_hash(net, name);
702
b67bfe0d 703 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
704 if (!strncmp(dev->name, name, IFNAMSIZ))
705 return dev;
706
707 return NULL;
708}
709EXPORT_SYMBOL(dev_get_by_name_rcu);
710
1da177e4
LT
711/**
712 * dev_get_by_name - find a device by its name
c4ea43c5 713 * @net: the applicable net namespace
1da177e4
LT
714 * @name: name to find
715 *
716 * Find an interface by name. This can be called from any
717 * context and does its own locking. The returned handle has
718 * the usage count incremented and the caller must use dev_put() to
719 * release it when it is no longer needed. %NULL is returned if no
720 * matching device is found.
721 */
722
881d966b 723struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
724{
725 struct net_device *dev;
726
72c9528b
ED
727 rcu_read_lock();
728 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
729 if (dev)
730 dev_hold(dev);
72c9528b 731 rcu_read_unlock();
1da177e4
LT
732 return dev;
733}
d1b19dff 734EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
735
736/**
737 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 738 * @net: the applicable net namespace
1da177e4
LT
739 * @ifindex: index of device
740 *
741 * Search for an interface by index. Returns %NULL if the device
742 * is not found or a pointer to the device. The device has not
743 * had its reference counter increased so the caller must be careful
744 * about locking. The caller must hold either the RTNL semaphore
745 * or @dev_base_lock.
746 */
747
881d966b 748struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 749{
0bd8d536
ED
750 struct net_device *dev;
751 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 752
b67bfe0d 753 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
754 if (dev->ifindex == ifindex)
755 return dev;
0bd8d536 756
1da177e4
LT
757 return NULL;
758}
d1b19dff 759EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 760
fb699dfd
ED
761/**
762 * dev_get_by_index_rcu - find a device by its ifindex
763 * @net: the applicable net namespace
764 * @ifindex: index of device
765 *
766 * Search for an interface by index. Returns %NULL if the device
767 * is not found or a pointer to the device. The device has not
768 * had its reference counter increased so the caller must be careful
769 * about locking. The caller must hold RCU lock.
770 */
771
772struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773{
fb699dfd
ED
774 struct net_device *dev;
775 struct hlist_head *head = dev_index_hash(net, ifindex);
776
b67bfe0d 777 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
778 if (dev->ifindex == ifindex)
779 return dev;
780
781 return NULL;
782}
783EXPORT_SYMBOL(dev_get_by_index_rcu);
784
1da177e4
LT
785
786/**
787 * dev_get_by_index - find a device by its ifindex
c4ea43c5 788 * @net: the applicable net namespace
1da177e4
LT
789 * @ifindex: index of device
790 *
791 * Search for an interface by index. Returns NULL if the device
792 * is not found or a pointer to the device. The device returned has
793 * had a reference added and the pointer is safe until the user calls
794 * dev_put to indicate they have finished with it.
795 */
796
881d966b 797struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
798{
799 struct net_device *dev;
800
fb699dfd
ED
801 rcu_read_lock();
802 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
803 if (dev)
804 dev_hold(dev);
fb699dfd 805 rcu_read_unlock();
1da177e4
LT
806 return dev;
807}
d1b19dff 808EXPORT_SYMBOL(dev_get_by_index);
1da177e4 809
5dbe7c17
NS
810/**
811 * netdev_get_name - get a netdevice name, knowing its ifindex.
812 * @net: network namespace
813 * @name: a pointer to the buffer where the name will be stored.
814 * @ifindex: the ifindex of the interface to get the name from.
815 *
816 * The use of raw_seqcount_begin() and cond_resched() before
817 * retrying is required as we want to give the writers a chance
818 * to complete when CONFIG_PREEMPT is not set.
819 */
820int netdev_get_name(struct net *net, char *name, int ifindex)
821{
822 struct net_device *dev;
823 unsigned int seq;
824
825retry:
826 seq = raw_seqcount_begin(&devnet_rename_seq);
827 rcu_read_lock();
828 dev = dev_get_by_index_rcu(net, ifindex);
829 if (!dev) {
830 rcu_read_unlock();
831 return -ENODEV;
832 }
833
834 strcpy(name, dev->name);
835 rcu_read_unlock();
836 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837 cond_resched();
838 goto retry;
839 }
840
841 return 0;
842}
843
1da177e4 844/**
941666c2 845 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 846 * @net: the applicable net namespace
1da177e4
LT
847 * @type: media type of device
848 * @ha: hardware address
849 *
850 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
851 * is not found or a pointer to the device.
852 * The caller must hold RCU or RTNL.
941666c2 853 * The returned device has not had its ref count increased
1da177e4
LT
854 * and the caller must therefore be careful about locking
855 *
1da177e4
LT
856 */
857
941666c2
ED
858struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859 const char *ha)
1da177e4
LT
860{
861 struct net_device *dev;
862
941666c2 863 for_each_netdev_rcu(net, dev)
1da177e4
LT
864 if (dev->type == type &&
865 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
866 return dev;
867
868 return NULL;
1da177e4 869}
941666c2 870EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 871
881d966b 872struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
873{
874 struct net_device *dev;
875
4e9cac2b 876 ASSERT_RTNL();
881d966b 877 for_each_netdev(net, dev)
4e9cac2b 878 if (dev->type == type)
7562f876
PE
879 return dev;
880
881 return NULL;
4e9cac2b 882}
4e9cac2b
PM
883EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884
881d966b 885struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 886{
99fe3c39 887 struct net_device *dev, *ret = NULL;
4e9cac2b 888
99fe3c39
ED
889 rcu_read_lock();
890 for_each_netdev_rcu(net, dev)
891 if (dev->type == type) {
892 dev_hold(dev);
893 ret = dev;
894 break;
895 }
896 rcu_read_unlock();
897 return ret;
1da177e4 898}
1da177e4
LT
899EXPORT_SYMBOL(dev_getfirstbyhwtype);
900
901/**
6c555490 902 * __dev_get_by_flags - find any device with given flags
c4ea43c5 903 * @net: the applicable net namespace
1da177e4
LT
904 * @if_flags: IFF_* values
905 * @mask: bitmask of bits in if_flags to check
906 *
907 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04 908 * is not found or a pointer to the device. Must be called inside
6c555490 909 * rtnl_lock(), and result refcount is unchanged.
1da177e4
LT
910 */
911
6c555490
WC
912struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913 unsigned short mask)
1da177e4 914{
7562f876 915 struct net_device *dev, *ret;
1da177e4 916
6c555490
WC
917 ASSERT_RTNL();
918
7562f876 919 ret = NULL;
6c555490 920 for_each_netdev(net, dev) {
1da177e4 921 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 922 ret = dev;
1da177e4
LT
923 break;
924 }
925 }
7562f876 926 return ret;
1da177e4 927}
6c555490 928EXPORT_SYMBOL(__dev_get_by_flags);
1da177e4
LT
929
930/**
931 * dev_valid_name - check if name is okay for network device
932 * @name: name string
933 *
934 * Network device names need to be valid file names to
c7fa9d18
DM
935 * to allow sysfs to work. We also disallow any kind of
936 * whitespace.
1da177e4 937 */
95f050bf 938bool dev_valid_name(const char *name)
1da177e4 939{
c7fa9d18 940 if (*name == '\0')
95f050bf 941 return false;
b6fe17d6 942 if (strlen(name) >= IFNAMSIZ)
95f050bf 943 return false;
c7fa9d18 944 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 945 return false;
c7fa9d18
DM
946
947 while (*name) {
948 if (*name == '/' || isspace(*name))
95f050bf 949 return false;
c7fa9d18
DM
950 name++;
951 }
95f050bf 952 return true;
1da177e4 953}
d1b19dff 954EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
955
956/**
b267b179
EB
957 * __dev_alloc_name - allocate a name for a device
958 * @net: network namespace to allocate the device name in
1da177e4 959 * @name: name format string
b267b179 960 * @buf: scratch buffer and result name string
1da177e4
LT
961 *
962 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
963 * id. It scans list of devices to build up a free map, then chooses
964 * the first empty slot. The caller must hold the dev_base or rtnl lock
965 * while allocating the name and adding the device in order to avoid
966 * duplicates.
967 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
969 */
970
b267b179 971static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
972{
973 int i = 0;
1da177e4
LT
974 const char *p;
975 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 976 unsigned long *inuse;
1da177e4
LT
977 struct net_device *d;
978
979 p = strnchr(name, IFNAMSIZ-1, '%');
980 if (p) {
981 /*
982 * Verify the string as this thing may have come from
983 * the user. There must be either one "%d" and no other "%"
984 * characters.
985 */
986 if (p[1] != 'd' || strchr(p + 2, '%'))
987 return -EINVAL;
988
989 /* Use one page as a bit array of possible slots */
cfcabdcc 990 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
991 if (!inuse)
992 return -ENOMEM;
993
881d966b 994 for_each_netdev(net, d) {
1da177e4
LT
995 if (!sscanf(d->name, name, &i))
996 continue;
997 if (i < 0 || i >= max_netdevices)
998 continue;
999
1000 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 1001 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
1002 if (!strncmp(buf, d->name, IFNAMSIZ))
1003 set_bit(i, inuse);
1004 }
1005
1006 i = find_first_zero_bit(inuse, max_netdevices);
1007 free_page((unsigned long) inuse);
1008 }
1009
d9031024
OP
1010 if (buf != name)
1011 snprintf(buf, IFNAMSIZ, name, i);
b267b179 1012 if (!__dev_get_by_name(net, buf))
1da177e4 1013 return i;
1da177e4
LT
1014
1015 /* It is possible to run out of possible slots
1016 * when the name is long and there isn't enough space left
1017 * for the digits, or if all bits are used.
1018 */
1019 return -ENFILE;
1020}
1021
b267b179
EB
1022/**
1023 * dev_alloc_name - allocate a name for a device
1024 * @dev: device
1025 * @name: name format string
1026 *
1027 * Passed a format string - eg "lt%d" it will try and find a suitable
1028 * id. It scans list of devices to build up a free map, then chooses
1029 * the first empty slot. The caller must hold the dev_base or rtnl lock
1030 * while allocating the name and adding the device in order to avoid
1031 * duplicates.
1032 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033 * Returns the number of the unit assigned or a negative errno code.
1034 */
1035
1036int dev_alloc_name(struct net_device *dev, const char *name)
1037{
1038 char buf[IFNAMSIZ];
1039 struct net *net;
1040 int ret;
1041
c346dca1
YH
1042 BUG_ON(!dev_net(dev));
1043 net = dev_net(dev);
b267b179
EB
1044 ret = __dev_alloc_name(net, name, buf);
1045 if (ret >= 0)
1046 strlcpy(dev->name, buf, IFNAMSIZ);
1047 return ret;
1048}
d1b19dff 1049EXPORT_SYMBOL(dev_alloc_name);
b267b179 1050
828de4f6
G
1051static int dev_alloc_name_ns(struct net *net,
1052 struct net_device *dev,
1053 const char *name)
d9031024 1054{
828de4f6
G
1055 char buf[IFNAMSIZ];
1056 int ret;
8ce6cebc 1057
828de4f6
G
1058 ret = __dev_alloc_name(net, name, buf);
1059 if (ret >= 0)
1060 strlcpy(dev->name, buf, IFNAMSIZ);
1061 return ret;
1062}
1063
1064static int dev_get_valid_name(struct net *net,
1065 struct net_device *dev,
1066 const char *name)
1067{
1068 BUG_ON(!net);
8ce6cebc 1069
d9031024
OP
1070 if (!dev_valid_name(name))
1071 return -EINVAL;
1072
1c5cae81 1073 if (strchr(name, '%'))
828de4f6 1074 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1075 else if (__dev_get_by_name(net, name))
1076 return -EEXIST;
8ce6cebc
DL
1077 else if (dev->name != name)
1078 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1079
1080 return 0;
1081}
1da177e4
LT
1082
1083/**
1084 * dev_change_name - change name of a device
1085 * @dev: device
1086 * @newname: name (or format string) must be at least IFNAMSIZ
1087 *
1088 * Change name of a device, can pass format strings "eth%d".
1089 * for wildcarding.
1090 */
cf04a4c7 1091int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1092{
238fa362 1093 unsigned char old_assign_type;
fcc5a03a 1094 char oldname[IFNAMSIZ];
1da177e4 1095 int err = 0;
fcc5a03a 1096 int ret;
881d966b 1097 struct net *net;
1da177e4
LT
1098
1099 ASSERT_RTNL();
c346dca1 1100 BUG_ON(!dev_net(dev));
1da177e4 1101
c346dca1 1102 net = dev_net(dev);
1da177e4
LT
1103 if (dev->flags & IFF_UP)
1104 return -EBUSY;
1105
30e6c9fa 1106 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1107
1108 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1109 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1110 return 0;
c91f6df2 1111 }
c8d90dca 1112
fcc5a03a
HX
1113 memcpy(oldname, dev->name, IFNAMSIZ);
1114
828de4f6 1115 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1116 if (err < 0) {
30e6c9fa 1117 write_seqcount_end(&devnet_rename_seq);
d9031024 1118 return err;
c91f6df2 1119 }
1da177e4 1120
6fe82a39
VF
1121 if (oldname[0] && !strchr(oldname, '%'))
1122 netdev_info(dev, "renamed from %s\n", oldname);
1123
238fa362
TG
1124 old_assign_type = dev->name_assign_type;
1125 dev->name_assign_type = NET_NAME_RENAMED;
1126
fcc5a03a 1127rollback:
a1b3f594
EB
1128 ret = device_rename(&dev->dev, dev->name);
1129 if (ret) {
1130 memcpy(dev->name, oldname, IFNAMSIZ);
238fa362 1131 dev->name_assign_type = old_assign_type;
30e6c9fa 1132 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1133 return ret;
dcc99773 1134 }
7f988eab 1135
30e6c9fa 1136 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1137
5bb025fa
VF
1138 netdev_adjacent_rename_links(dev, oldname);
1139
7f988eab 1140 write_lock_bh(&dev_base_lock);
372b2312 1141 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1142 write_unlock_bh(&dev_base_lock);
1143
1144 synchronize_rcu();
1145
1146 write_lock_bh(&dev_base_lock);
1147 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1148 write_unlock_bh(&dev_base_lock);
1149
056925ab 1150 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1151 ret = notifier_to_errno(ret);
1152
1153 if (ret) {
91e9c07b
ED
1154 /* err >= 0 after dev_alloc_name() or stores the first errno */
1155 if (err >= 0) {
fcc5a03a 1156 err = ret;
30e6c9fa 1157 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a 1158 memcpy(dev->name, oldname, IFNAMSIZ);
5bb025fa 1159 memcpy(oldname, newname, IFNAMSIZ);
238fa362
TG
1160 dev->name_assign_type = old_assign_type;
1161 old_assign_type = NET_NAME_RENAMED;
fcc5a03a 1162 goto rollback;
91e9c07b 1163 } else {
7b6cd1ce 1164 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1165 dev->name, ret);
fcc5a03a
HX
1166 }
1167 }
1da177e4
LT
1168
1169 return err;
1170}
1171
0b815a1a
SH
1172/**
1173 * dev_set_alias - change ifalias of a device
1174 * @dev: device
1175 * @alias: name up to IFALIASZ
f0db275a 1176 * @len: limit of bytes to copy from info
0b815a1a
SH
1177 *
1178 * Set ifalias for a device,
1179 */
1180int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181{
7364e445
AK
1182 char *new_ifalias;
1183
0b815a1a
SH
1184 ASSERT_RTNL();
1185
1186 if (len >= IFALIASZ)
1187 return -EINVAL;
1188
96ca4a2c 1189 if (!len) {
388dfc2d
SK
1190 kfree(dev->ifalias);
1191 dev->ifalias = NULL;
96ca4a2c
OH
1192 return 0;
1193 }
1194
7364e445
AK
1195 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196 if (!new_ifalias)
0b815a1a 1197 return -ENOMEM;
7364e445 1198 dev->ifalias = new_ifalias;
0b815a1a
SH
1199
1200 strlcpy(dev->ifalias, alias, len+1);
1201 return len;
1202}
1203
1204
d8a33ac4 1205/**
3041a069 1206 * netdev_features_change - device changes features
d8a33ac4
SH
1207 * @dev: device to cause notification
1208 *
1209 * Called to indicate a device has changed features.
1210 */
1211void netdev_features_change(struct net_device *dev)
1212{
056925ab 1213 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1214}
1215EXPORT_SYMBOL(netdev_features_change);
1216
1da177e4
LT
1217/**
1218 * netdev_state_change - device changes state
1219 * @dev: device to cause notification
1220 *
1221 * Called to indicate a device has changed state. This function calls
1222 * the notifier chains for netdev_chain and sends a NEWLINK message
1223 * to the routing socket.
1224 */
1225void netdev_state_change(struct net_device *dev)
1226{
1227 if (dev->flags & IFF_UP) {
54951194
LP
1228 struct netdev_notifier_change_info change_info;
1229
1230 change_info.flags_changed = 0;
1231 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232 &change_info.info);
7f294054 1233 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1da177e4
LT
1234 }
1235}
d1b19dff 1236EXPORT_SYMBOL(netdev_state_change);
1da177e4 1237
ee89bab1
AW
1238/**
1239 * netdev_notify_peers - notify network peers about existence of @dev
1240 * @dev: network device
1241 *
1242 * Generate traffic such that interested network peers are aware of
1243 * @dev, such as by generating a gratuitous ARP. This may be used when
1244 * a device wants to inform the rest of the network about some sort of
1245 * reconfiguration such as a failover event or virtual machine
1246 * migration.
1247 */
1248void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1249{
ee89bab1
AW
1250 rtnl_lock();
1251 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252 rtnl_unlock();
c1da4ac7 1253}
ee89bab1 1254EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1255
bd380811 1256static int __dev_open(struct net_device *dev)
1da177e4 1257{
d314774c 1258 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1259 int ret;
1da177e4 1260
e46b66bc
BH
1261 ASSERT_RTNL();
1262
1da177e4
LT
1263 if (!netif_device_present(dev))
1264 return -ENODEV;
1265
ca99ca14
NH
1266 /* Block netpoll from trying to do any rx path servicing.
1267 * If we don't do this there is a chance ndo_poll_controller
1268 * or ndo_poll may be running while we open the device
1269 */
66b5552f 1270 netpoll_poll_disable(dev);
ca99ca14 1271
3b8bcfd5
JB
1272 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273 ret = notifier_to_errno(ret);
1274 if (ret)
1275 return ret;
1276
1da177e4 1277 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1278
d314774c
SH
1279 if (ops->ndo_validate_addr)
1280 ret = ops->ndo_validate_addr(dev);
bada339b 1281
d314774c
SH
1282 if (!ret && ops->ndo_open)
1283 ret = ops->ndo_open(dev);
1da177e4 1284
66b5552f 1285 netpoll_poll_enable(dev);
ca99ca14 1286
bada339b
JG
1287 if (ret)
1288 clear_bit(__LINK_STATE_START, &dev->state);
1289 else {
1da177e4 1290 dev->flags |= IFF_UP;
4417da66 1291 dev_set_rx_mode(dev);
1da177e4 1292 dev_activate(dev);
7bf23575 1293 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1294 }
bada339b 1295
1da177e4
LT
1296 return ret;
1297}
1298
1299/**
bd380811
PM
1300 * dev_open - prepare an interface for use.
1301 * @dev: device to open
1da177e4 1302 *
bd380811
PM
1303 * Takes a device from down to up state. The device's private open
1304 * function is invoked and then the multicast lists are loaded. Finally
1305 * the device is moved into the up state and a %NETDEV_UP message is
1306 * sent to the netdev notifier chain.
1307 *
1308 * Calling this function on an active interface is a nop. On a failure
1309 * a negative errno code is returned.
1da177e4 1310 */
bd380811
PM
1311int dev_open(struct net_device *dev)
1312{
1313 int ret;
1314
bd380811
PM
1315 if (dev->flags & IFF_UP)
1316 return 0;
1317
bd380811
PM
1318 ret = __dev_open(dev);
1319 if (ret < 0)
1320 return ret;
1321
7f294054 1322 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
bd380811
PM
1323 call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325 return ret;
1326}
1327EXPORT_SYMBOL(dev_open);
1328
44345724 1329static int __dev_close_many(struct list_head *head)
1da177e4 1330{
44345724 1331 struct net_device *dev;
e46b66bc 1332
bd380811 1333 ASSERT_RTNL();
9d5010db
DM
1334 might_sleep();
1335
5cde2829 1336 list_for_each_entry(dev, head, close_list) {
3f4df206 1337 /* Temporarily disable netpoll until the interface is down */
66b5552f 1338 netpoll_poll_disable(dev);
3f4df206 1339
44345724 1340 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1341
44345724 1342 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1343
44345724
OP
1344 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345 * can be even on different cpu. So just clear netif_running().
1346 *
1347 * dev->stop() will invoke napi_disable() on all of it's
1348 * napi_struct instances on this device.
1349 */
4e857c58 1350 smp_mb__after_atomic(); /* Commit netif_running(). */
44345724 1351 }
1da177e4 1352
44345724 1353 dev_deactivate_many(head);
d8b2a4d2 1354
5cde2829 1355 list_for_each_entry(dev, head, close_list) {
44345724 1356 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1357
44345724
OP
1358 /*
1359 * Call the device specific close. This cannot fail.
1360 * Only if device is UP
1361 *
1362 * We allow it to be called even after a DETACH hot-plug
1363 * event.
1364 */
1365 if (ops->ndo_stop)
1366 ops->ndo_stop(dev);
1367
44345724 1368 dev->flags &= ~IFF_UP;
66b5552f 1369 netpoll_poll_enable(dev);
44345724
OP
1370 }
1371
1372 return 0;
1373}
1374
1375static int __dev_close(struct net_device *dev)
1376{
f87e6f47 1377 int retval;
44345724
OP
1378 LIST_HEAD(single);
1379
5cde2829 1380 list_add(&dev->close_list, &single);
f87e6f47
LT
1381 retval = __dev_close_many(&single);
1382 list_del(&single);
ca99ca14 1383
f87e6f47 1384 return retval;
44345724
OP
1385}
1386
3fbd8758 1387static int dev_close_many(struct list_head *head)
44345724
OP
1388{
1389 struct net_device *dev, *tmp;
1da177e4 1390
5cde2829
EB
1391 /* Remove the devices that don't need to be closed */
1392 list_for_each_entry_safe(dev, tmp, head, close_list)
44345724 1393 if (!(dev->flags & IFF_UP))
5cde2829 1394 list_del_init(&dev->close_list);
44345724
OP
1395
1396 __dev_close_many(head);
1da177e4 1397
5cde2829 1398 list_for_each_entry_safe(dev, tmp, head, close_list) {
7f294054 1399 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
44345724 1400 call_netdevice_notifiers(NETDEV_DOWN, dev);
5cde2829 1401 list_del_init(&dev->close_list);
44345724 1402 }
bd380811
PM
1403
1404 return 0;
1405}
1406
1407/**
1408 * dev_close - shutdown an interface.
1409 * @dev: device to shutdown
1410 *
1411 * This function moves an active device into down state. A
1412 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414 * chain.
1415 */
1416int dev_close(struct net_device *dev)
1417{
e14a5993
ED
1418 if (dev->flags & IFF_UP) {
1419 LIST_HEAD(single);
1da177e4 1420
5cde2829 1421 list_add(&dev->close_list, &single);
e14a5993
ED
1422 dev_close_many(&single);
1423 list_del(&single);
1424 }
da6e378b 1425 return 0;
1da177e4 1426}
d1b19dff 1427EXPORT_SYMBOL(dev_close);
1da177e4
LT
1428
1429
0187bdfb
BH
1430/**
1431 * dev_disable_lro - disable Large Receive Offload on a device
1432 * @dev: device
1433 *
1434 * Disable Large Receive Offload (LRO) on a net device. Must be
1435 * called under RTNL. This is needed if received packets may be
1436 * forwarded to another interface.
1437 */
1438void dev_disable_lro(struct net_device *dev)
1439{
fbe168ba
MK
1440 struct net_device *lower_dev;
1441 struct list_head *iter;
529d0489 1442
bc5787c6
MM
1443 dev->wanted_features &= ~NETIF_F_LRO;
1444 netdev_update_features(dev);
27660515 1445
22d5969f
MM
1446 if (unlikely(dev->features & NETIF_F_LRO))
1447 netdev_WARN(dev, "failed to disable LRO!\n");
fbe168ba
MK
1448
1449 netdev_for_each_lower_dev(dev, lower_dev, iter)
1450 dev_disable_lro(lower_dev);
0187bdfb
BH
1451}
1452EXPORT_SYMBOL(dev_disable_lro);
1453
351638e7
JP
1454static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455 struct net_device *dev)
1456{
1457 struct netdev_notifier_info info;
1458
1459 netdev_notifier_info_init(&info, dev);
1460 return nb->notifier_call(nb, val, &info);
1461}
0187bdfb 1462
881d966b
EB
1463static int dev_boot_phase = 1;
1464
1da177e4
LT
1465/**
1466 * register_netdevice_notifier - register a network notifier block
1467 * @nb: notifier
1468 *
1469 * Register a notifier to be called when network device events occur.
1470 * The notifier passed is linked into the kernel structures and must
1471 * not be reused until it has been unregistered. A negative errno code
1472 * is returned on a failure.
1473 *
1474 * When registered all registration and up events are replayed
4ec93edb 1475 * to the new notifier to allow device to have a race free
1da177e4
LT
1476 * view of the network device list.
1477 */
1478
1479int register_netdevice_notifier(struct notifier_block *nb)
1480{
1481 struct net_device *dev;
fcc5a03a 1482 struct net_device *last;
881d966b 1483 struct net *net;
1da177e4
LT
1484 int err;
1485
1486 rtnl_lock();
f07d5b94 1487 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1488 if (err)
1489 goto unlock;
881d966b
EB
1490 if (dev_boot_phase)
1491 goto unlock;
1492 for_each_net(net) {
1493 for_each_netdev(net, dev) {
351638e7 1494 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
881d966b
EB
1495 err = notifier_to_errno(err);
1496 if (err)
1497 goto rollback;
1498
1499 if (!(dev->flags & IFF_UP))
1500 continue;
1da177e4 1501
351638e7 1502 call_netdevice_notifier(nb, NETDEV_UP, dev);
881d966b 1503 }
1da177e4 1504 }
fcc5a03a
HX
1505
1506unlock:
1da177e4
LT
1507 rtnl_unlock();
1508 return err;
fcc5a03a
HX
1509
1510rollback:
1511 last = dev;
881d966b
EB
1512 for_each_net(net) {
1513 for_each_netdev(net, dev) {
1514 if (dev == last)
8f891489 1515 goto outroll;
fcc5a03a 1516
881d966b 1517 if (dev->flags & IFF_UP) {
351638e7
JP
1518 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519 dev);
1520 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
881d966b 1521 }
351638e7 1522 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1523 }
fcc5a03a 1524 }
c67625a1 1525
8f891489 1526outroll:
c67625a1 1527 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1528 goto unlock;
1da177e4 1529}
d1b19dff 1530EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1531
1532/**
1533 * unregister_netdevice_notifier - unregister a network notifier block
1534 * @nb: notifier
1535 *
1536 * Unregister a notifier previously registered by
1537 * register_netdevice_notifier(). The notifier is unlinked into the
1538 * kernel structures and may then be reused. A negative errno code
1539 * is returned on a failure.
7d3d43da
EB
1540 *
1541 * After unregistering unregister and down device events are synthesized
1542 * for all devices on the device list to the removed notifier to remove
1543 * the need for special case cleanup code.
1da177e4
LT
1544 */
1545
1546int unregister_netdevice_notifier(struct notifier_block *nb)
1547{
7d3d43da
EB
1548 struct net_device *dev;
1549 struct net *net;
9f514950
HX
1550 int err;
1551
1552 rtnl_lock();
f07d5b94 1553 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1554 if (err)
1555 goto unlock;
1556
1557 for_each_net(net) {
1558 for_each_netdev(net, dev) {
1559 if (dev->flags & IFF_UP) {
351638e7
JP
1560 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561 dev);
1562 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
7d3d43da 1563 }
351638e7 1564 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1565 }
1566 }
1567unlock:
9f514950
HX
1568 rtnl_unlock();
1569 return err;
1da177e4 1570}
d1b19dff 1571EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4 1572
351638e7
JP
1573/**
1574 * call_netdevice_notifiers_info - call all network notifier blocks
1575 * @val: value passed unmodified to notifier function
1576 * @dev: net_device pointer passed unmodified to notifier function
1577 * @info: notifier information data
1578 *
1579 * Call all network notifier blocks. Parameters and return value
1580 * are as for raw_notifier_call_chain().
1581 */
1582
1d143d9f 1583static int call_netdevice_notifiers_info(unsigned long val,
1584 struct net_device *dev,
1585 struct netdev_notifier_info *info)
351638e7
JP
1586{
1587 ASSERT_RTNL();
1588 netdev_notifier_info_init(info, dev);
1589 return raw_notifier_call_chain(&netdev_chain, val, info);
1590}
351638e7 1591
1da177e4
LT
1592/**
1593 * call_netdevice_notifiers - call all network notifier blocks
1594 * @val: value passed unmodified to notifier function
c4ea43c5 1595 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1596 *
1597 * Call all network notifier blocks. Parameters and return value
f07d5b94 1598 * are as for raw_notifier_call_chain().
1da177e4
LT
1599 */
1600
ad7379d4 1601int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1602{
351638e7
JP
1603 struct netdev_notifier_info info;
1604
1605 return call_netdevice_notifiers_info(val, dev, &info);
1da177e4 1606}
edf947f1 1607EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1608
c5905afb 1609static struct static_key netstamp_needed __read_mostly;
b90e5794 1610#ifdef HAVE_JUMP_LABEL
c5905afb 1611/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1612 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1613 * static_key_slow_dec() calls.
b90e5794
ED
1614 */
1615static atomic_t netstamp_needed_deferred;
1616#endif
1da177e4
LT
1617
1618void net_enable_timestamp(void)
1619{
b90e5794
ED
1620#ifdef HAVE_JUMP_LABEL
1621 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622
1623 if (deferred) {
1624 while (--deferred)
c5905afb 1625 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1626 return;
1627 }
1628#endif
c5905afb 1629 static_key_slow_inc(&netstamp_needed);
1da177e4 1630}
d1b19dff 1631EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1632
1633void net_disable_timestamp(void)
1634{
b90e5794
ED
1635#ifdef HAVE_JUMP_LABEL
1636 if (in_interrupt()) {
1637 atomic_inc(&netstamp_needed_deferred);
1638 return;
1639 }
1640#endif
c5905afb 1641 static_key_slow_dec(&netstamp_needed);
1da177e4 1642}
d1b19dff 1643EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1644
3b098e2d 1645static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1646{
588f0330 1647 skb->tstamp.tv64 = 0;
c5905afb 1648 if (static_key_false(&netstamp_needed))
a61bbcf2 1649 __net_timestamp(skb);
1da177e4
LT
1650}
1651
588f0330 1652#define net_timestamp_check(COND, SKB) \
c5905afb 1653 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1654 if ((COND) && !(SKB)->tstamp.tv64) \
1655 __net_timestamp(SKB); \
1656 } \
3b098e2d 1657
1ee481fb 1658bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
79b569f0
DL
1659{
1660 unsigned int len;
1661
1662 if (!(dev->flags & IFF_UP))
1663 return false;
1664
1665 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666 if (skb->len <= len)
1667 return true;
1668
1669 /* if TSO is enabled, we don't care about the length as the packet
1670 * could be forwarded without being segmented before
1671 */
1672 if (skb_is_gso(skb))
1673 return true;
1674
1675 return false;
1676}
1ee481fb 1677EXPORT_SYMBOL_GPL(is_skb_forwardable);
79b569f0 1678
a0265d28
HX
1679int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680{
1681 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683 atomic_long_inc(&dev->rx_dropped);
1684 kfree_skb(skb);
1685 return NET_RX_DROP;
1686 }
1687 }
1688
1689 if (unlikely(!is_skb_forwardable(dev, skb))) {
1690 atomic_long_inc(&dev->rx_dropped);
1691 kfree_skb(skb);
1692 return NET_RX_DROP;
1693 }
1694
1695 skb_scrub_packet(skb, true);
1696 skb->protocol = eth_type_trans(skb, dev);
1697
1698 return 0;
1699}
1700EXPORT_SYMBOL_GPL(__dev_forward_skb);
1701
44540960
AB
1702/**
1703 * dev_forward_skb - loopback an skb to another netif
1704 *
1705 * @dev: destination network device
1706 * @skb: buffer to forward
1707 *
1708 * return values:
1709 * NET_RX_SUCCESS (no congestion)
6ec82562 1710 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1711 *
1712 * dev_forward_skb can be used for injecting an skb from the
1713 * start_xmit function of one device into the receive queue
1714 * of another device.
1715 *
1716 * The receiving device may be in another namespace, so
1717 * we have to clear all information in the skb that could
1718 * impact namespace isolation.
1719 */
1720int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1721{
a0265d28 1722 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
44540960
AB
1723}
1724EXPORT_SYMBOL_GPL(dev_forward_skb);
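/*
 * Illustrative sketch (not part of dev.c itself): how a pair-style virtual
 * driver might use dev_forward_skb() from its ndo_start_xmit, as described
 * above.  "struct pairdev_priv" and its peer pointer are hypothetical.
 */
#if 0	/* example only */
struct pairdev_priv {
	struct net_device *peer;	/* the other end of the pair */
};

static netdev_tx_t pairdev_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	struct pairdev_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() always consumes the skb. */
	if (dev_forward_skb(priv->peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif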
1725
71d9dec2
CG
1726static inline int deliver_skb(struct sk_buff *skb,
1727 struct packet_type *pt_prev,
1728 struct net_device *orig_dev)
1729{
1080e512
MT
1730 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1731 return -ENOMEM;
71d9dec2
CG
1732 atomic_inc(&skb->users);
1733 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1734}
1735
c0de08d0
EL
1736static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1737{
a3d744e9 1738 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1739 return false;
1740
1741 if (ptype->id_match)
1742 return ptype->id_match(ptype, skb->sk);
1743 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1744 return true;
1745
1746 return false;
1747}
1748
1da177e4
LT
1749/*
1750 * Support routine. Sends outgoing frames to any network
1751 * taps currently in use.
1752 */
1753
f6a78bfc 1754static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1755{
1756 struct packet_type *ptype;
71d9dec2
CG
1757 struct sk_buff *skb2 = NULL;
1758 struct packet_type *pt_prev = NULL;
a61bbcf2 1759
1da177e4
LT
1760 rcu_read_lock();
1761 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1762 /* Never send packets back to the socket
1763 * they originated from - MvS (miquels@drinkel.ow.org)
1764 */
1765 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1766 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1767 if (pt_prev) {
1768 deliver_skb(skb2, pt_prev, skb->dev);
1769 pt_prev = ptype;
1770 continue;
1771 }
1772
1773 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1774 if (!skb2)
1775 break;
1776
70978182
ED
1777 net_timestamp_set(skb2);
1778
1da177e4
LT
1779 /* skb->nh should be correctly
1780 set by sender, so that the second statement is
1781 just protection against buggy protocols.
1782 */
459a98ed 1783 skb_reset_mac_header(skb2);
1da177e4 1784
d56f90a7 1785 if (skb_network_header(skb2) < skb2->data ||
ced14f68 1786 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
e87cc472
JP
1787 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1788 ntohs(skb2->protocol),
1789 dev->name);
c1d2bbe1 1790 skb_reset_network_header(skb2);
1da177e4
LT
1791 }
1792
b0e380b1 1793 skb2->transport_header = skb2->network_header;
1da177e4 1794 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1795 pt_prev = ptype;
1da177e4
LT
1796 }
1797 }
71d9dec2
CG
1798 if (pt_prev)
1799 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1800 rcu_read_unlock();
1801}
1802
2c53040f
BH
1803/**
1804 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1805 * @dev: Network device
1806 * @txq: number of queues available
1807 *
 1808 * If real_num_tx_queues is changed the tc mappings may no longer be
 1809 * valid. To resolve this, verify that the tc mapping remains valid and,
 1810 * if not, NULL the mapping. With no priorities mapping to this
 1811 * offset/count pair it will no longer be used. In the worst case, if
 1812 * TC0 is invalid, nothing can be done, so priority mappings are
 1813 * disabled. It is expected that drivers will fix this mapping if they
 1814 * can before calling netif_set_real_num_tx_queues.
1815 */
bb134d22 1816static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1817{
1818 int i;
1819 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1820
1821 /* If TC0 is invalidated disable TC mapping */
1822 if (tc->offset + tc->count > txq) {
7b6cd1ce 1823 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1824 dev->num_tc = 0;
1825 return;
1826 }
1827
1828 /* Invalidated prio to tc mappings set to TC0 */
1829 for (i = 1; i < TC_BITMASK + 1; i++) {
1830 int q = netdev_get_prio_tc_map(dev, i);
1831
1832 tc = &dev->tc_to_txq[q];
1833 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1834 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1835 i, q);
4f57c087
JF
1836 netdev_set_prio_tc_map(dev, i, 0);
1837 }
1838 }
1839}
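/*
 * Illustrative sketch (not part of dev.c itself): the kind of prio->tc->queue
 * setup a multiqueue driver might program, which netif_setup_tc() above then
 * revalidates when real_num_tx_queues shrinks.  The queue counts, offsets and
 * priority split are made up.
 */
#if 0	/* example only */
static void example_setup_two_tcs(struct net_device *dev)
{
	int prio;

	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0 -> queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1 -> queues 4-7 */

	/* Map priorities 0-3 to TC0 and 4-15 to TC1. */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
}
#endif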
1840
537c00de
AD
1841#ifdef CONFIG_XPS
1842static DEFINE_MUTEX(xps_map_mutex);
1843#define xmap_dereference(P) \
1844 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1845
10cdc3f3
AD
1846static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1847 int cpu, u16 index)
537c00de 1848{
10cdc3f3
AD
1849 struct xps_map *map = NULL;
1850 int pos;
537c00de 1851
10cdc3f3
AD
1852 if (dev_maps)
1853 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1854
10cdc3f3
AD
1855 for (pos = 0; map && pos < map->len; pos++) {
1856 if (map->queues[pos] == index) {
537c00de
AD
1857 if (map->len > 1) {
1858 map->queues[pos] = map->queues[--map->len];
1859 } else {
10cdc3f3 1860 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1861 kfree_rcu(map, rcu);
1862 map = NULL;
1863 }
10cdc3f3 1864 break;
537c00de 1865 }
537c00de
AD
1866 }
1867
10cdc3f3
AD
1868 return map;
1869}
1870
024e9679 1871static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1872{
1873 struct xps_dev_maps *dev_maps;
024e9679 1874 int cpu, i;
10cdc3f3
AD
1875 bool active = false;
1876
1877 mutex_lock(&xps_map_mutex);
1878 dev_maps = xmap_dereference(dev->xps_maps);
1879
1880 if (!dev_maps)
1881 goto out_no_maps;
1882
1883 for_each_possible_cpu(cpu) {
024e9679
AD
1884 for (i = index; i < dev->num_tx_queues; i++) {
1885 if (!remove_xps_queue(dev_maps, cpu, i))
1886 break;
1887 }
1888 if (i == dev->num_tx_queues)
10cdc3f3
AD
1889 active = true;
1890 }
1891
1892 if (!active) {
537c00de
AD
1893 RCU_INIT_POINTER(dev->xps_maps, NULL);
1894 kfree_rcu(dev_maps, rcu);
1895 }
1896
024e9679
AD
1897 for (i = index; i < dev->num_tx_queues; i++)
1898 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1899 NUMA_NO_NODE);
1900
537c00de
AD
1901out_no_maps:
1902 mutex_unlock(&xps_map_mutex);
1903}
1904
01c5f864
AD
1905static struct xps_map *expand_xps_map(struct xps_map *map,
1906 int cpu, u16 index)
1907{
1908 struct xps_map *new_map;
1909 int alloc_len = XPS_MIN_MAP_ALLOC;
1910 int i, pos;
1911
1912 for (pos = 0; map && pos < map->len; pos++) {
1913 if (map->queues[pos] != index)
1914 continue;
1915 return map;
1916 }
1917
1918 /* Need to add queue to this CPU's existing map */
1919 if (map) {
1920 if (pos < map->alloc_len)
1921 return map;
1922
1923 alloc_len = map->alloc_len * 2;
1924 }
1925
 1926 /* Need to allocate a new map to store the queue on this CPU */
1927 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1928 cpu_to_node(cpu));
1929 if (!new_map)
1930 return NULL;
1931
1932 for (i = 0; i < pos; i++)
1933 new_map->queues[i] = map->queues[i];
1934 new_map->alloc_len = alloc_len;
1935 new_map->len = pos;
1936
1937 return new_map;
1938}
1939
3573540c
MT
1940int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1941 u16 index)
537c00de 1942{
01c5f864 1943 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1944 struct xps_map *map, *new_map;
537c00de 1945 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1946 int cpu, numa_node_id = -2;
1947 bool active = false;
537c00de
AD
1948
1949 mutex_lock(&xps_map_mutex);
1950
1951 dev_maps = xmap_dereference(dev->xps_maps);
1952
01c5f864
AD
1953 /* allocate memory for queue storage */
1954 for_each_online_cpu(cpu) {
1955 if (!cpumask_test_cpu(cpu, mask))
1956 continue;
1957
1958 if (!new_dev_maps)
1959 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
1960 if (!new_dev_maps) {
1961 mutex_unlock(&xps_map_mutex);
01c5f864 1962 return -ENOMEM;
2bb60cb9 1963 }
01c5f864
AD
1964
1965 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1966 NULL;
1967
1968 map = expand_xps_map(map, cpu, index);
1969 if (!map)
1970 goto error;
1971
1972 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1973 }
1974
1975 if (!new_dev_maps)
1976 goto out_no_new_maps;
1977
537c00de 1978 for_each_possible_cpu(cpu) {
01c5f864
AD
1979 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1980 /* add queue to CPU maps */
1981 int pos = 0;
1982
1983 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1984 while ((pos < map->len) && (map->queues[pos] != index))
1985 pos++;
1986
1987 if (pos == map->len)
1988 map->queues[map->len++] = index;
537c00de 1989#ifdef CONFIG_NUMA
537c00de
AD
1990 if (numa_node_id == -2)
1991 numa_node_id = cpu_to_node(cpu);
1992 else if (numa_node_id != cpu_to_node(cpu))
1993 numa_node_id = -1;
537c00de 1994#endif
01c5f864
AD
1995 } else if (dev_maps) {
1996 /* fill in the new device map from the old device map */
1997 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1998 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 1999 }
01c5f864 2000
537c00de
AD
2001 }
2002
01c5f864
AD
2003 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2004
537c00de 2005 /* Cleanup old maps */
01c5f864
AD
2006 if (dev_maps) {
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2010 if (map && map != new_map)
2011 kfree_rcu(map, rcu);
2012 }
537c00de 2013
01c5f864 2014 kfree_rcu(dev_maps, rcu);
537c00de
AD
2015 }
2016
01c5f864
AD
2017 dev_maps = new_dev_maps;
2018 active = true;
537c00de 2019
01c5f864
AD
2020out_no_new_maps:
2021 /* update Tx queue numa node */
537c00de
AD
2022 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2023 (numa_node_id >= 0) ? numa_node_id :
2024 NUMA_NO_NODE);
2025
01c5f864
AD
2026 if (!dev_maps)
2027 goto out_no_maps;
2028
2029 /* removes queue from unused CPUs */
2030 for_each_possible_cpu(cpu) {
2031 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2032 continue;
2033
2034 if (remove_xps_queue(dev_maps, cpu, index))
2035 active = true;
2036 }
2037
2038 /* free map if not active */
2039 if (!active) {
2040 RCU_INIT_POINTER(dev->xps_maps, NULL);
2041 kfree_rcu(dev_maps, rcu);
2042 }
2043
2044out_no_maps:
537c00de
AD
2045 mutex_unlock(&xps_map_mutex);
2046
2047 return 0;
2048error:
01c5f864
AD
2049 /* remove any maps that we added */
2050 for_each_possible_cpu(cpu) {
2051 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2052 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2053 NULL;
2054 if (new_map && new_map != map)
2055 kfree(new_map);
2056 }
2057
537c00de
AD
2058 mutex_unlock(&xps_map_mutex);
2059
537c00de
AD
2060 kfree(new_dev_maps);
2061 return -ENOMEM;
2062}
2063EXPORT_SYMBOL(netif_set_xps_queue);
2064
2065#endif
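/*
 * Illustrative sketch (not part of dev.c itself): a driver steering each of
 * its TX queues to one CPU with netif_set_xps_queue().  The simple 1:1
 * queue<->CPU layout is an assumption, not something this file mandates.
 */
#if 0	/* example only */
static void example_setup_xps(struct net_device *dev)
{
	u16 qindex = 0;
	int cpu;

	for_each_online_cpu(cpu) {
		if (qindex >= dev->real_num_tx_queues)
			break;
		if (netif_set_xps_queue(dev, cpumask_of(cpu), qindex))
			netdev_warn(dev, "XPS setup failed for txq %u\n",
				    qindex);
		qindex++;
	}
}
#endif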
f0796d5c
JF
2066/*
 2067 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 2068 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2069 */
e6484930 2070int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2071{
1d24eb48
TH
2072 int rc;
2073
e6484930
TH
2074 if (txq < 1 || txq > dev->num_tx_queues)
2075 return -EINVAL;
f0796d5c 2076
5c56580b
BH
2077 if (dev->reg_state == NETREG_REGISTERED ||
2078 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2079 ASSERT_RTNL();
2080
1d24eb48
TH
2081 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2082 txq);
bf264145
TH
2083 if (rc)
2084 return rc;
2085
4f57c087
JF
2086 if (dev->num_tc)
2087 netif_setup_tc(dev, txq);
2088
024e9679 2089 if (txq < dev->real_num_tx_queues) {
e6484930 2090 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2091#ifdef CONFIG_XPS
2092 netif_reset_xps_queues_gt(dev, txq);
2093#endif
2094 }
f0796d5c 2095 }
e6484930
TH
2096
2097 dev->real_num_tx_queues = txq;
2098 return 0;
f0796d5c
JF
2099}
2100EXPORT_SYMBOL(netif_set_real_num_tx_queues);
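/*
 * Illustrative sketch (not part of dev.c itself): resizing the active TX
 * queue set at runtime.  The caller is assumed to already hold rtnl_lock
 * (e.g. from an ethtool set_channels handler), as required once the device
 * is registered.
 */
#if 0	/* example only */
static int example_resize_tx(struct net_device *dev, unsigned int count)
{
	ASSERT_RTNL();

	return netif_set_real_num_tx_queues(dev, count);
}
#endif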
56079431 2101
a953be53 2102#ifdef CONFIG_SYSFS
62fe0b40
BH
2103/**
2104 * netif_set_real_num_rx_queues - set actual number of RX queues used
2105 * @dev: Network device
2106 * @rxq: Actual number of RX queues
2107 *
2108 * This must be called either with the rtnl_lock held or before
2109 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2110 * negative error code. If called before registration, it always
2111 * succeeds.
62fe0b40
BH
2112 */
2113int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2114{
2115 int rc;
2116
bd25fa7b
TH
2117 if (rxq < 1 || rxq > dev->num_rx_queues)
2118 return -EINVAL;
2119
62fe0b40
BH
2120 if (dev->reg_state == NETREG_REGISTERED) {
2121 ASSERT_RTNL();
2122
62fe0b40
BH
2123 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2124 rxq);
2125 if (rc)
2126 return rc;
62fe0b40
BH
2127 }
2128
2129 dev->real_num_rx_queues = rxq;
2130 return 0;
2131}
2132EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2133#endif
2134
2c53040f
BH
2135/**
2136 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2137 *
2138 * This routine should set an upper limit on the number of RSS queues
2139 * used by default by multiqueue devices.
2140 */
a55b138b 2141int netif_get_num_default_rss_queues(void)
16917b87
YM
2142{
2143 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2144}
2145EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2146
def82a1d 2147static inline void __netif_reschedule(struct Qdisc *q)
56079431 2148{
def82a1d
JP
2149 struct softnet_data *sd;
2150 unsigned long flags;
56079431 2151
def82a1d 2152 local_irq_save(flags);
903ceff7 2153 sd = this_cpu_ptr(&softnet_data);
a9cbd588
CG
2154 q->next_sched = NULL;
2155 *sd->output_queue_tailp = q;
2156 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2157 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2158 local_irq_restore(flags);
2159}
2160
2161void __netif_schedule(struct Qdisc *q)
2162{
2163 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2164 __netif_reschedule(q);
56079431
DV
2165}
2166EXPORT_SYMBOL(__netif_schedule);
2167
e6247027
ED
2168struct dev_kfree_skb_cb {
2169 enum skb_free_reason reason;
2170};
2171
2172static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
56079431 2173{
e6247027
ED
2174 return (struct dev_kfree_skb_cb *)skb->cb;
2175}
2176
46e5da40
JF
2177void netif_schedule_queue(struct netdev_queue *txq)
2178{
2179 rcu_read_lock();
2180 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2181 struct Qdisc *q = rcu_dereference(txq->qdisc);
2182
2183 __netif_schedule(q);
2184 }
2185 rcu_read_unlock();
2186}
2187EXPORT_SYMBOL(netif_schedule_queue);
2188
2189/**
2190 * netif_wake_subqueue - allow sending packets on subqueue
2191 * @dev: network device
2192 * @queue_index: sub queue index
2193 *
2194 * Resume individual transmit queue of a device with multiple transmit queues.
2195 */
2196void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2197{
2198 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2199
2200 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2201 struct Qdisc *q;
2202
2203 rcu_read_lock();
2204 q = rcu_dereference(txq->qdisc);
2205 __netif_schedule(q);
2206 rcu_read_unlock();
2207 }
2208}
2209EXPORT_SYMBOL(netif_wake_subqueue);
2210
2211void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2212{
2213 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2214 struct Qdisc *q;
2215
2216 rcu_read_lock();
2217 q = rcu_dereference(dev_queue->qdisc);
2218 __netif_schedule(q);
2219 rcu_read_unlock();
2220 }
2221}
2222EXPORT_SYMBOL(netif_tx_wake_queue);
2223
e6247027 2224void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
56079431 2225{
e6247027 2226 unsigned long flags;
56079431 2227
e6247027
ED
2228 if (likely(atomic_read(&skb->users) == 1)) {
2229 smp_rmb();
2230 atomic_set(&skb->users, 0);
2231 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2232 return;
bea3348e 2233 }
e6247027
ED
2234 get_kfree_skb_cb(skb)->reason = reason;
2235 local_irq_save(flags);
2236 skb->next = __this_cpu_read(softnet_data.completion_queue);
2237 __this_cpu_write(softnet_data.completion_queue, skb);
2238 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2239 local_irq_restore(flags);
56079431 2240}
e6247027 2241EXPORT_SYMBOL(__dev_kfree_skb_irq);
56079431 2242
e6247027 2243void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
56079431
DV
2244{
2245 if (in_irq() || irqs_disabled())
e6247027 2246 __dev_kfree_skb_irq(skb, reason);
56079431
DV
2247 else
2248 dev_kfree_skb(skb);
2249}
e6247027 2250EXPORT_SYMBOL(__dev_kfree_skb_any);
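/*
 * Illustrative sketch (not part of dev.c itself): freeing transmitted skbs
 * from TX-completion context, which may be hard irq.  The completed-skb array
 * is a stand-in for a real driver's TX ring bookkeeping.
 */
#if 0	/* example only */
static void example_clean_tx_irq(struct sk_buff **completed, int n)
{
	int i;

	for (i = 0; i < n; i++)
		dev_consume_skb_any(completed[i]);	/* successfully sent */
}
#endif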
56079431
DV
2251
2252
bea3348e
SH
2253/**
2254 * netif_device_detach - mark device as removed
2255 * @dev: network device
2256 *
 2257 * Mark device as removed from the system and therefore no longer available.
2258 */
56079431
DV
2259void netif_device_detach(struct net_device *dev)
2260{
2261 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2262 netif_running(dev)) {
d543103a 2263 netif_tx_stop_all_queues(dev);
56079431
DV
2264 }
2265}
2266EXPORT_SYMBOL(netif_device_detach);
2267
bea3348e
SH
2268/**
2269 * netif_device_attach - mark device as attached
2270 * @dev: network device
2271 *
 2272 * Mark device as attached to the system and restart if needed.
2273 */
56079431
DV
2274void netif_device_attach(struct net_device *dev)
2275{
2276 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2277 netif_running(dev)) {
d543103a 2278 netif_tx_wake_all_queues(dev);
4ec93edb 2279 __netdev_watchdog_up(dev);
56079431
DV
2280 }
2281}
2282EXPORT_SYMBOL(netif_device_attach);
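/*
 * Illustrative sketch (not part of dev.c itself): the usual pairing of
 * netif_device_detach()/netif_device_attach() in a driver's suspend/resume
 * path.  The hardware-specific steps are omitted.
 */
#if 0	/* example only */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);
	/* ... quiesce DMA, save state, power down ... */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* ... power up, restore state, restart DMA ... */
	netif_device_attach(dev);
	return 0;
}
#endif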
2283
36c92474
BH
2284static void skb_warn_bad_offload(const struct sk_buff *skb)
2285{
65e9d2fa 2286 static const netdev_features_t null_features = 0;
36c92474
BH
2287 struct net_device *dev = skb->dev;
2288 const char *driver = "";
2289
c846ad9b
BG
2290 if (!net_ratelimit())
2291 return;
2292
36c92474
BH
2293 if (dev && dev->dev.parent)
2294 driver = dev_driver_string(dev->dev.parent);
2295
2296 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2297 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2298 driver, dev ? &dev->features : &null_features,
2299 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2300 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2301 skb_shinfo(skb)->gso_type, skb->ip_summed);
2302}
2303
1da177e4
LT
2304/*
2305 * Invalidate hardware checksum when packet is to be mangled, and
2306 * complete checksum manually on outgoing path.
2307 */
84fa7933 2308int skb_checksum_help(struct sk_buff *skb)
1da177e4 2309{
d3bc23e7 2310 __wsum csum;
663ead3b 2311 int ret = 0, offset;
1da177e4 2312
84fa7933 2313 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2314 goto out_set_summed;
2315
2316 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2317 skb_warn_bad_offload(skb);
2318 return -EINVAL;
1da177e4
LT
2319 }
2320
cef401de
ED
2321 /* Before computing a checksum, we should make sure no frag could
 2322 * be modified by an external entity: the checksum could be wrong.
2323 */
2324 if (skb_has_shared_frag(skb)) {
2325 ret = __skb_linearize(skb);
2326 if (ret)
2327 goto out;
2328 }
2329
55508d60 2330 offset = skb_checksum_start_offset(skb);
a030847e
HX
2331 BUG_ON(offset >= skb_headlen(skb));
2332 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2333
2334 offset += skb->csum_offset;
2335 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2336
2337 if (skb_cloned(skb) &&
2338 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2339 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2340 if (ret)
2341 goto out;
2342 }
2343
a030847e 2344 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2345out_set_summed:
1da177e4 2346 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2347out:
1da177e4
LT
2348 return ret;
2349}
d1b19dff 2350EXPORT_SYMBOL(skb_checksum_help);
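/*
 * Illustrative sketch (not part of dev.c itself): a driver falling back to
 * skb_checksum_help() when its hardware cannot offload the checksum of a
 * particular frame.  "hw_can_csum" is a made-up predicate.
 */
#if 0	/* example only */
static int example_tx_prepare_csum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* 0 on success */

	return 0;
}
#endif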
1da177e4 2351
53d6471c 2352__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
f6a78bfc 2353{
4b9b1cdf 2354 unsigned int vlan_depth = skb->mac_len;
252e3346 2355 __be16 type = skb->protocol;
f6a78bfc 2356
19acc327
PS
2357 /* Tunnel gso handlers can set protocol to ethernet. */
2358 if (type == htons(ETH_P_TEB)) {
2359 struct ethhdr *eth;
2360
2361 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362 return 0;
2363
2364 eth = (struct ethhdr *)skb_mac_header(skb);
2365 type = eth->h_proto;
2366 }
2367
4b9b1cdf
NA
2368 /* if skb->protocol is 802.1Q/AD then the header should already be
2369 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2370 * ETH_HLEN otherwise
2371 */
2372 if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2373 if (vlan_depth) {
80019d31 2374 if (WARN_ON(vlan_depth < VLAN_HLEN))
4b9b1cdf
NA
2375 return 0;
2376 vlan_depth -= VLAN_HLEN;
2377 } else {
2378 vlan_depth = ETH_HLEN;
2379 }
2380 do {
2381 struct vlan_hdr *vh;
2382
2383 if (unlikely(!pskb_may_pull(skb,
2384 vlan_depth + VLAN_HLEN)))
2385 return 0;
2386
2387 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2388 type = vh->h_vlan_encapsulated_proto;
2389 vlan_depth += VLAN_HLEN;
2390 } while (type == htons(ETH_P_8021Q) ||
2391 type == htons(ETH_P_8021AD));
7b9c6090
JG
2392 }
2393
53d6471c
VY
2394 *depth = vlan_depth;
2395
ec5f0615
PS
2396 return type;
2397}
2398
2399/**
2400 * skb_mac_gso_segment - mac layer segmentation handler.
2401 * @skb: buffer to segment
2402 * @features: features for the output path (see dev->features)
2403 */
2404struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2405 netdev_features_t features)
2406{
2407 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2408 struct packet_offload *ptype;
53d6471c
VY
2409 int vlan_depth = skb->mac_len;
2410 __be16 type = skb_network_protocol(skb, &vlan_depth);
ec5f0615
PS
2411
2412 if (unlikely(!type))
2413 return ERR_PTR(-EINVAL);
2414
53d6471c 2415 __skb_pull(skb, vlan_depth);
f6a78bfc
HX
2416
2417 rcu_read_lock();
22061d80 2418 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2419 if (ptype->type == type && ptype->callbacks.gso_segment) {
f191a1d1 2420 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2421 break;
2422 }
2423 }
2424 rcu_read_unlock();
2425
98e399f8 2426 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2427
f6a78bfc
HX
2428 return segs;
2429}
05e8ef4a
PS
2430EXPORT_SYMBOL(skb_mac_gso_segment);
2431
2432
2433/* openvswitch calls this on rx path, so we need a different check.
2434 */
2435static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2436{
2437 if (tx_path)
2438 return skb->ip_summed != CHECKSUM_PARTIAL;
2439 else
2440 return skb->ip_summed == CHECKSUM_NONE;
2441}
2442
2443/**
2444 * __skb_gso_segment - Perform segmentation on skb.
2445 * @skb: buffer to segment
2446 * @features: features for the output path (see dev->features)
2447 * @tx_path: whether it is called in TX path
2448 *
2449 * This function segments the given skb and returns a list of segments.
2450 *
2451 * It may return NULL if the skb requires no segmentation. This is
2452 * only possible when GSO is used for verifying header integrity.
2453 */
2454struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2455 netdev_features_t features, bool tx_path)
2456{
2457 if (unlikely(skb_needs_check(skb, tx_path))) {
2458 int err;
2459
2460 skb_warn_bad_offload(skb);
2461
a40e0a66 2462 err = skb_cow_head(skb, 0);
2463 if (err < 0)
05e8ef4a
PS
2464 return ERR_PTR(err);
2465 }
2466
68c33163 2467 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3347c960
ED
2468 SKB_GSO_CB(skb)->encap_level = 0;
2469
05e8ef4a
PS
2470 skb_reset_mac_header(skb);
2471 skb_reset_mac_len(skb);
2472
2473 return skb_mac_gso_segment(skb, features);
2474}
12b0004d 2475EXPORT_SYMBOL(__skb_gso_segment);
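/*
 * Illustrative sketch (not part of dev.c itself): software-segmenting an skb
 * and walking the resulting list, roughly what validate_xmit_skb() does later
 * in this file.  The actual transmit step is left as a comment.
 */
#if 0	/* example only */
static int example_segment(struct sk_buff *skb, netdev_features_t features)
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return 0;		/* no segmentation was needed */

	consume_skb(skb);		/* original skb is now superseded */
	while (segs) {
		next = segs->next;
		segs->next = NULL;
		/* ... hand "segs" to the device here ... */
		segs = next;
	}
	return 0;
}
#endif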
f6a78bfc 2476
fb286bb2
HX
2477/* Take action when hardware reception checksum errors are detected. */
2478#ifdef CONFIG_BUG
2479void netdev_rx_csum_fault(struct net_device *dev)
2480{
2481 if (net_ratelimit()) {
7b6cd1ce 2482 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2483 dump_stack();
2484 }
2485}
2486EXPORT_SYMBOL(netdev_rx_csum_fault);
2487#endif
2488
1da177e4
LT
2489/* Actually, we should eliminate this check as soon as we know that:
 2490 * 1. IOMMU is present and allows mapping all the memory.
2491 * 2. No high memory really exists on this machine.
2492 */
2493
c1e756bf 2494static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2495{
3d3a8533 2496#ifdef CONFIG_HIGHMEM
1da177e4 2497 int i;
5acbbd42 2498 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2499 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2500 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2501 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2502 return 1;
ea2ab693 2503 }
5acbbd42 2504 }
1da177e4 2505
5acbbd42
FT
2506 if (PCI_DMA_BUS_IS_PHYS) {
2507 struct device *pdev = dev->dev.parent;
1da177e4 2508
9092c658
ED
2509 if (!pdev)
2510 return 0;
5acbbd42 2511 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2512 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2513 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2514 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2515 return 1;
2516 }
2517 }
3d3a8533 2518#endif
1da177e4
LT
2519 return 0;
2520}
1da177e4 2521
3b392ddb
SH
2522/* For an MPLS offload request, verify we are testing hardware MPLS features
2523 * instead of standard features for the netdev.
2524 */
d0edc7bf 2525#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3b392ddb
SH
2526static netdev_features_t net_mpls_features(struct sk_buff *skb,
2527 netdev_features_t features,
2528 __be16 type)
2529{
25cd9ba0 2530 if (eth_p_mpls(type))
3b392ddb
SH
2531 features &= skb->dev->mpls_features;
2532
2533 return features;
2534}
2535#else
2536static netdev_features_t net_mpls_features(struct sk_buff *skb,
2537 netdev_features_t features,
2538 __be16 type)
2539{
2540 return features;
2541}
2542#endif
2543
c8f44aff 2544static netdev_features_t harmonize_features(struct sk_buff *skb,
c1e756bf 2545 netdev_features_t features)
f01a5236 2546{
53d6471c 2547 int tmp;
3b392ddb
SH
2548 __be16 type;
2549
2550 type = skb_network_protocol(skb, &tmp);
2551 features = net_mpls_features(skb, features, type);
53d6471c 2552
c0d680e5 2553 if (skb->ip_summed != CHECKSUM_NONE &&
3b392ddb 2554 !can_checksum_protocol(features, type)) {
f01a5236 2555 features &= ~NETIF_F_ALL_CSUM;
c1e756bf 2556 } else if (illegal_highdma(skb->dev, skb)) {
f01a5236
JG
2557 features &= ~NETIF_F_SG;
2558 }
2559
2560 return features;
2561}
2562
c1e756bf 2563netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6 2564{
fcbeb976
ED
2565 const struct net_device *dev = skb->dev;
2566 netdev_features_t features = dev->features;
2567 u16 gso_segs = skb_shinfo(skb)->gso_segs;
58e998c6
JG
2568 __be16 protocol = skb->protocol;
2569
fcbeb976 2570 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
30b678d8
BH
2571 features &= ~NETIF_F_GSO_MASK;
2572
796f2da8
TM
2573 if (!vlan_tx_tag_present(skb)) {
2574 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2575 protocol == htons(ETH_P_8021AD))) {
2576 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2577 protocol = veh->h_vlan_encapsulated_proto;
2578 } else {
2579 return harmonize_features(skb, features);
2580 }
f01a5236 2581 }
58e998c6 2582
db115037 2583 features = netdev_intersect_features(features,
fcbeb976 2584 dev->vlan_features |
db115037
MK
2585 NETIF_F_HW_VLAN_CTAG_TX |
2586 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2587
cdbaa0bb 2588 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
db115037
MK
2589 features = netdev_intersect_features(features,
2590 NETIF_F_SG |
2591 NETIF_F_HIGHDMA |
2592 NETIF_F_FRAGLIST |
2593 NETIF_F_GEN_CSUM |
2594 NETIF_F_HW_VLAN_CTAG_TX |
2595 NETIF_F_HW_VLAN_STAG_TX);
cdbaa0bb 2596
c1e756bf 2597 return harmonize_features(skb, features);
58e998c6 2598}
c1e756bf 2599EXPORT_SYMBOL(netif_skb_features);
58e998c6 2600
2ea25513 2601static int xmit_one(struct sk_buff *skb, struct net_device *dev,
95f6b3dd 2602 struct netdev_queue *txq, bool more)
f6a78bfc 2603{
2ea25513
DM
2604 unsigned int len;
2605 int rc;
00829823 2606
2ea25513
DM
2607 if (!list_empty(&ptype_all))
2608 dev_queue_xmit_nit(skb, dev);
fc741216 2609
2ea25513
DM
2610 len = skb->len;
2611 trace_net_dev_start_xmit(skb, dev);
95f6b3dd 2612 rc = netdev_start_xmit(skb, dev, txq, more);
2ea25513 2613 trace_net_dev_xmit(skb, rc, dev, len);
adf30907 2614
2ea25513
DM
2615 return rc;
2616}
7b9c6090 2617
8dcda22a
DM
2618struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2619 struct netdev_queue *txq, int *ret)
7f2e870f
DM
2620{
2621 struct sk_buff *skb = first;
2622 int rc = NETDEV_TX_OK;
7b9c6090 2623
7f2e870f
DM
2624 while (skb) {
2625 struct sk_buff *next = skb->next;
fc70fb64 2626
7f2e870f 2627 skb->next = NULL;
95f6b3dd 2628 rc = xmit_one(skb, dev, txq, next != NULL);
7f2e870f
DM
2629 if (unlikely(!dev_xmit_complete(rc))) {
2630 skb->next = next;
2631 goto out;
2632 }
6afff0ca 2633
7f2e870f
DM
2634 skb = next;
2635 if (netif_xmit_stopped(txq) && skb) {
2636 rc = NETDEV_TX_BUSY;
2637 break;
9ccb8975 2638 }
7f2e870f 2639 }
9ccb8975 2640
7f2e870f
DM
2641out:
2642 *ret = rc;
2643 return skb;
2644}
b40863c6 2645
1ff0dc94
ED
2646static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2647 netdev_features_t features)
f6a78bfc 2648{
eae3f88e 2649 if (vlan_tx_tag_present(skb) &&
5968250c
JP
2650 !vlan_hw_offload_capable(features, skb->vlan_proto))
2651 skb = __vlan_hwaccel_push_inside(skb);
eae3f88e
DM
2652 return skb;
2653}
f6a78bfc 2654
55a93b3e 2655static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
eae3f88e
DM
2656{
2657 netdev_features_t features;
f6a78bfc 2658
eae3f88e
DM
2659 if (skb->next)
2660 return skb;
068a2de5 2661
eae3f88e
DM
2662 features = netif_skb_features(skb);
2663 skb = validate_xmit_vlan(skb, features);
2664 if (unlikely(!skb))
2665 goto out_null;
7b9c6090 2666
eae3f88e
DM
 2667 /* For an encapsulation offload request, verify we are testing
2668 * hardware encapsulation features instead of standard
2669 * features for the netdev
2670 */
2671 if (skb->encapsulation)
2672 features &= dev->hw_enc_features;
2673
04ffcb25 2674 if (netif_needs_gso(dev, skb, features)) {
ce93718f
DM
2675 struct sk_buff *segs;
2676
2677 segs = skb_gso_segment(skb, features);
cecda693 2678 if (IS_ERR(segs)) {
af6dabc9 2679 goto out_kfree_skb;
cecda693
JW
2680 } else if (segs) {
2681 consume_skb(skb);
2682 skb = segs;
f6a78bfc 2683 }
eae3f88e
DM
2684 } else {
2685 if (skb_needs_linearize(skb, features) &&
2686 __skb_linearize(skb))
2687 goto out_kfree_skb;
4ec93edb 2688
eae3f88e
DM
2689 /* If packet is not checksummed and device does not
2690 * support checksumming for this protocol, complete
2691 * checksumming here.
2692 */
2693 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2694 if (skb->encapsulation)
2695 skb_set_inner_transport_header(skb,
2696 skb_checksum_start_offset(skb));
2697 else
2698 skb_set_transport_header(skb,
2699 skb_checksum_start_offset(skb));
2700 if (!(features & NETIF_F_ALL_CSUM) &&
2701 skb_checksum_help(skb))
2702 goto out_kfree_skb;
7b9c6090 2703 }
0c772159 2704 }
7b9c6090 2705
eae3f88e 2706 return skb;
fc70fb64 2707
f6a78bfc
HX
2708out_kfree_skb:
2709 kfree_skb(skb);
eae3f88e
DM
2710out_null:
2711 return NULL;
2712}
6afff0ca 2713
55a93b3e
ED
2714struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2715{
2716 struct sk_buff *next, *head = NULL, *tail;
2717
bec3cfdc 2718 for (; skb != NULL; skb = next) {
55a93b3e
ED
2719 next = skb->next;
2720 skb->next = NULL;
bec3cfdc
ED
2721
 2722 /* in case skb won't be segmented, point to itself */
2723 skb->prev = skb;
2724
55a93b3e 2725 skb = validate_xmit_skb(skb, dev);
bec3cfdc
ED
2726 if (!skb)
2727 continue;
55a93b3e 2728
bec3cfdc
ED
2729 if (!head)
2730 head = skb;
2731 else
2732 tail->next = skb;
2733 /* If skb was segmented, skb->prev points to
2734 * the last segment. If not, it still contains skb.
2735 */
2736 tail = skb->prev;
55a93b3e
ED
2737 }
2738 return head;
f6a78bfc
HX
2739}
2740
1def9238
ED
2741static void qdisc_pkt_len_init(struct sk_buff *skb)
2742{
2743 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2744
2745 qdisc_skb_cb(skb)->pkt_len = skb->len;
2746
 2747 /* To get a more precise estimate of the bytes sent on the wire,
 2748 * we add the header size of every segment to pkt_len
2749 */
2750 if (shinfo->gso_size) {
757b8b1d 2751 unsigned int hdr_len;
15e5a030 2752 u16 gso_segs = shinfo->gso_segs;
1def9238 2753
757b8b1d
ED
2754 /* mac layer + network layer */
2755 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2756
2757 /* + transport layer */
1def9238
ED
2758 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2759 hdr_len += tcp_hdrlen(skb);
2760 else
2761 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2762
2763 if (shinfo->gso_type & SKB_GSO_DODGY)
2764 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2765 shinfo->gso_size);
2766
2767 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2768 }
2769}
2770
bbd8a0d3
KK
2771static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2772 struct net_device *dev,
2773 struct netdev_queue *txq)
2774{
2775 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2776 bool contended;
bbd8a0d3
KK
2777 int rc;
2778
1def9238 2779 qdisc_pkt_len_init(skb);
a2da570d 2780 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2781 /*
2782 * Heuristic to force contended enqueues to serialize on a
2783 * separate lock before trying to get qdisc main lock.
9bf2b8c2
YX
2784 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2785 * often and dequeue packets faster.
79640a4c 2786 */
a2da570d 2787 contended = qdisc_is_running(q);
79640a4c
ED
2788 if (unlikely(contended))
2789 spin_lock(&q->busylock);
2790
bbd8a0d3
KK
2791 spin_lock(root_lock);
2792 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2793 kfree_skb(skb);
2794 rc = NET_XMIT_DROP;
2795 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2796 qdisc_run_begin(q)) {
bbd8a0d3
KK
2797 /*
2798 * This is a work-conserving queue; there are no old skbs
2799 * waiting to be sent out; and the qdisc is not running -
2800 * xmit the skb directly.
2801 */
bfe0d029 2802
bfe0d029
ED
2803 qdisc_bstats_update(q, skb);
2804
55a93b3e 2805 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
79640a4c
ED
2806 if (unlikely(contended)) {
2807 spin_unlock(&q->busylock);
2808 contended = false;
2809 }
bbd8a0d3 2810 __qdisc_run(q);
79640a4c 2811 } else
bc135b23 2812 qdisc_run_end(q);
bbd8a0d3
KK
2813
2814 rc = NET_XMIT_SUCCESS;
2815 } else {
a2da570d 2816 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2817 if (qdisc_run_begin(q)) {
2818 if (unlikely(contended)) {
2819 spin_unlock(&q->busylock);
2820 contended = false;
2821 }
2822 __qdisc_run(q);
2823 }
bbd8a0d3
KK
2824 }
2825 spin_unlock(root_lock);
79640a4c
ED
2826 if (unlikely(contended))
2827 spin_unlock(&q->busylock);
bbd8a0d3
KK
2828 return rc;
2829}
2830
86f8515f 2831#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
5bc1421e
NH
2832static void skb_update_prio(struct sk_buff *skb)
2833{
6977a79d 2834 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2835
91c68ce2
ED
2836 if (!skb->priority && skb->sk && map) {
2837 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2838
2839 if (prioidx < map->priomap_len)
2840 skb->priority = map->priomap[prioidx];
2841 }
5bc1421e
NH
2842}
2843#else
2844#define skb_update_prio(skb)
2845#endif
2846
745e20f1 2847static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2848#define RECURSION_LIMIT 10
745e20f1 2849
95603e22
MM
2850/**
2851 * dev_loopback_xmit - loop back @skb
2852 * @skb: buffer to transmit
2853 */
2854int dev_loopback_xmit(struct sk_buff *skb)
2855{
2856 skb_reset_mac_header(skb);
2857 __skb_pull(skb, skb_network_offset(skb));
2858 skb->pkt_type = PACKET_LOOPBACK;
2859 skb->ip_summed = CHECKSUM_UNNECESSARY;
2860 WARN_ON(!skb_dst(skb));
2861 skb_dst_force(skb);
2862 netif_rx_ni(skb);
2863 return 0;
2864}
2865EXPORT_SYMBOL(dev_loopback_xmit);
2866
d29f749e 2867/**
9d08dd3d 2868 * __dev_queue_xmit - transmit a buffer
d29f749e 2869 * @skb: buffer to transmit
9d08dd3d 2870 * @accel_priv: private data used for L2 forwarding offload
d29f749e
DJ
2871 *
2872 * Queue a buffer for transmission to a network device. The caller must
2873 * have set the device and priority and built the buffer before calling
2874 * this function. The function can be called from an interrupt.
2875 *
2876 * A negative errno code is returned on a failure. A success does not
2877 * guarantee the frame will be transmitted as it may be dropped due
2878 * to congestion or traffic shaping.
2879 *
2880 * -----------------------------------------------------------------------------------
2881 * I notice this method can also return errors from the queue disciplines,
2882 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2883 * be positive.
2884 *
2885 * Regardless of the return value, the skb is consumed, so it is currently
2886 * difficult to retry a send to this method. (You can bump the ref count
2887 * before sending to hold a reference for retry if you are careful.)
2888 *
2889 * When calling this method, interrupts MUST be enabled. This is because
2890 * the BH enable code must have IRQs enabled so that it will not deadlock.
2891 * --BLG
2892 */
0a59f3a9 2893static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
1da177e4
LT
2894{
2895 struct net_device *dev = skb->dev;
dc2b4847 2896 struct netdev_queue *txq;
1da177e4
LT
2897 struct Qdisc *q;
2898 int rc = -ENOMEM;
2899
6d1ccff6
ED
2900 skb_reset_mac_header(skb);
2901
e7fd2885
WB
2902 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2903 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2904
4ec93edb
YH
2905 /* Disable soft irqs for various locks below. Also
2906 * stops preemption for RCU.
1da177e4 2907 */
4ec93edb 2908 rcu_read_lock_bh();
1da177e4 2909
5bc1421e
NH
2910 skb_update_prio(skb);
2911
02875878
ED
2912 /* If device/qdisc don't need skb->dst, release it right now while
2913 * its hot in this cpu cache.
2914 */
2915 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2916 skb_dst_drop(skb);
2917 else
2918 skb_dst_force(skb);
2919
f663dd9a 2920 txq = netdev_pick_tx(dev, skb, accel_priv);
a898def2 2921 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2922
1da177e4 2923#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2924 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2925#endif
cf66ba58 2926 trace_net_dev_queue(skb);
1da177e4 2927 if (q->enqueue) {
bbd8a0d3 2928 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2929 goto out;
1da177e4
LT
2930 }
2931
2932 /* The device has no queue. Common case for software devices:
 2933 loopback, all sorts of tunnels...
2934
932ff279
HX
2935 Really, it is unlikely that netif_tx_lock protection is necessary
 2936 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2937 counters.)
 2938 However, it is possible that they rely on the protection
 2939 provided by us here.
2940
 2941 Check this and take the lock. It is not prone to deadlocks.
 2942 Alternatively, shoot the noqueue qdisc; it is even simpler 8)
2943 */
2944 if (dev->flags & IFF_UP) {
2945 int cpu = smp_processor_id(); /* ok because BHs are off */
2946
c773e847 2947 if (txq->xmit_lock_owner != cpu) {
1da177e4 2948
745e20f1
ED
2949 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2950 goto recursion_alert;
2951
1f59533f
JDB
2952 skb = validate_xmit_skb(skb, dev);
2953 if (!skb)
2954 goto drop;
2955
c773e847 2956 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2957
73466498 2958 if (!netif_xmit_stopped(txq)) {
745e20f1 2959 __this_cpu_inc(xmit_recursion);
ce93718f 2960 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
745e20f1 2961 __this_cpu_dec(xmit_recursion);
572a9d7b 2962 if (dev_xmit_complete(rc)) {
c773e847 2963 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2964 goto out;
2965 }
2966 }
c773e847 2967 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2968 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2969 dev->name);
1da177e4
LT
2970 } else {
2971 /* Recursion is detected! It is possible,
745e20f1
ED
2972 * unfortunately
2973 */
2974recursion_alert:
e87cc472
JP
2975 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2976 dev->name);
1da177e4
LT
2977 }
2978 }
2979
2980 rc = -ENETDOWN;
1f59533f 2981drop:
d4828d85 2982 rcu_read_unlock_bh();
1da177e4 2983
015f0688 2984 atomic_long_inc(&dev->tx_dropped);
1f59533f 2985 kfree_skb_list(skb);
1da177e4
LT
2986 return rc;
2987out:
d4828d85 2988 rcu_read_unlock_bh();
1da177e4
LT
2989 return rc;
2990}
f663dd9a
JW
2991
2992int dev_queue_xmit(struct sk_buff *skb)
2993{
2994 return __dev_queue_xmit(skb, NULL);
2995}
d1b19dff 2996EXPORT_SYMBOL(dev_queue_xmit);
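/*
 * Illustrative sketch (not part of dev.c itself): what a caller does before
 * handing a frame to dev_queue_xmit(), per the comment above: build the
 * buffer, set skb->dev and priority, add the link-layer header.  The use of
 * ETH_P_802_3 and a broadcast destination here is arbitrary.
 */
#if 0	/* example only */
static int example_send_raw(struct net_device *dev, const void *data,
			    unsigned int len)
{
	struct sk_buff *skb;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), data, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_3);
	skb->priority = 0;

	if (dev_hard_header(skb, dev, ETH_P_802_3, dev->broadcast,
			    dev->dev_addr, len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* The skb is consumed whatever the return value. */
	return dev_queue_xmit(skb);
}
#endif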
1da177e4 2997
f663dd9a
JW
2998int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2999{
3000 return __dev_queue_xmit(skb, accel_priv);
3001}
3002EXPORT_SYMBOL(dev_queue_xmit_accel);
3003
1da177e4
LT
3004
3005/*=======================================================================
3006 Receiver routines
3007 =======================================================================*/
3008
6b2bedc3 3009int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
3010EXPORT_SYMBOL(netdev_max_backlog);
3011
3b098e2d 3012int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
3013int netdev_budget __read_mostly = 300;
3014int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 3015
eecfd7c4
ED
3016/* Called with irq disabled */
3017static inline void ____napi_schedule(struct softnet_data *sd,
3018 struct napi_struct *napi)
3019{
3020 list_add_tail(&napi->poll_list, &sd->poll_list);
3021 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3022}
3023
bfb564e7
KK
3024#ifdef CONFIG_RPS
3025
3026/* One global table that all flow-based protocols share. */
6e3f7faf 3027struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
3028EXPORT_SYMBOL(rps_sock_flow_table);
3029
c5905afb 3030struct static_key rps_needed __read_mostly;
adc9300e 3031
c445477d
BH
3032static struct rps_dev_flow *
3033set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3034 struct rps_dev_flow *rflow, u16 next_cpu)
3035{
09994d1b 3036 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
3037#ifdef CONFIG_RFS_ACCEL
3038 struct netdev_rx_queue *rxqueue;
3039 struct rps_dev_flow_table *flow_table;
3040 struct rps_dev_flow *old_rflow;
3041 u32 flow_id;
3042 u16 rxq_index;
3043 int rc;
3044
3045 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
3046 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3047 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
3048 goto out;
3049 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3050 if (rxq_index == skb_get_rx_queue(skb))
3051 goto out;
3052
3053 rxqueue = dev->_rx + rxq_index;
3054 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3055 if (!flow_table)
3056 goto out;
61b905da 3057 flow_id = skb_get_hash(skb) & flow_table->mask;
c445477d
BH
3058 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3059 rxq_index, flow_id);
3060 if (rc < 0)
3061 goto out;
3062 old_rflow = rflow;
3063 rflow = &flow_table->flows[flow_id];
c445477d
BH
3064 rflow->filter = rc;
3065 if (old_rflow->filter == rflow->filter)
3066 old_rflow->filter = RPS_NO_FILTER;
3067 out:
3068#endif
3069 rflow->last_qtail =
09994d1b 3070 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
3071 }
3072
09994d1b 3073 rflow->cpu = next_cpu;
c445477d
BH
3074 return rflow;
3075}
3076
bfb564e7
KK
3077/*
3078 * get_rps_cpu is called from netif_receive_skb and returns the target
3079 * CPU from the RPS map of the receiving queue for a given skb.
3080 * rcu_read_lock must be held on entry.
3081 */
3082static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3083 struct rps_dev_flow **rflowp)
3084{
3085 struct netdev_rx_queue *rxqueue;
6e3f7faf 3086 struct rps_map *map;
bfb564e7
KK
3087 struct rps_dev_flow_table *flow_table;
3088 struct rps_sock_flow_table *sock_flow_table;
3089 int cpu = -1;
3090 u16 tcpu;
61b905da 3091 u32 hash;
bfb564e7
KK
3092
3093 if (skb_rx_queue_recorded(skb)) {
3094 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
3095 if (unlikely(index >= dev->real_num_rx_queues)) {
3096 WARN_ONCE(dev->real_num_rx_queues > 1,
3097 "%s received packet on queue %u, but number "
3098 "of RX queues is %u\n",
3099 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
3100 goto done;
3101 }
3102 rxqueue = dev->_rx + index;
3103 } else
3104 rxqueue = dev->_rx;
3105
6e3f7faf
ED
3106 map = rcu_dereference(rxqueue->rps_map);
3107 if (map) {
85875236 3108 if (map->len == 1 &&
33d480ce 3109 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
3110 tcpu = map->cpus[0];
3111 if (cpu_online(tcpu))
3112 cpu = tcpu;
3113 goto done;
3114 }
33d480ce 3115 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 3116 goto done;
6febfca9 3117 }
bfb564e7 3118
2d47b459 3119 skb_reset_network_header(skb);
61b905da
TH
3120 hash = skb_get_hash(skb);
3121 if (!hash)
bfb564e7
KK
3122 goto done;
3123
fec5e652
TH
3124 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3125 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3126 if (flow_table && sock_flow_table) {
3127 u16 next_cpu;
3128 struct rps_dev_flow *rflow;
3129
61b905da 3130 rflow = &flow_table->flows[hash & flow_table->mask];
fec5e652
TH
3131 tcpu = rflow->cpu;
3132
61b905da 3133 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
fec5e652
TH
3134
3135 /*
3136 * If the desired CPU (where last recvmsg was done) is
3137 * different from current CPU (one in the rx-queue flow
3138 * table entry), switch if one of the following holds:
3139 * - Current CPU is unset (equal to RPS_NO_CPU).
3140 * - Current CPU is offline.
3141 * - The current CPU's queue tail has advanced beyond the
3142 * last packet that was enqueued using this table entry.
3143 * This guarantees that all previous packets for the flow
3144 * have been dequeued, thus preserving in order delivery.
3145 */
3146 if (unlikely(tcpu != next_cpu) &&
3147 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3148 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3149 rflow->last_qtail)) >= 0)) {
3150 tcpu = next_cpu;
c445477d 3151 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3152 }
c445477d 3153
fec5e652
TH
3154 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3155 *rflowp = rflow;
3156 cpu = tcpu;
3157 goto done;
3158 }
3159 }
3160
0a9627f2 3161 if (map) {
8fc54f68 3162 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
0a9627f2
TH
3163 if (cpu_online(tcpu)) {
3164 cpu = tcpu;
3165 goto done;
3166 }
3167 }
3168
3169done:
0a9627f2
TH
3170 return cpu;
3171}
3172
c445477d
BH
3173#ifdef CONFIG_RFS_ACCEL
3174
3175/**
3176 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3177 * @dev: Device on which the filter was set
3178 * @rxq_index: RX queue index
3179 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3180 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3181 *
3182 * Drivers that implement ndo_rx_flow_steer() should periodically call
3183 * this function for each installed filter and remove the filters for
3184 * which it returns %true.
3185 */
3186bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3187 u32 flow_id, u16 filter_id)
3188{
3189 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3190 struct rps_dev_flow_table *flow_table;
3191 struct rps_dev_flow *rflow;
3192 bool expire = true;
3193 int cpu;
3194
3195 rcu_read_lock();
3196 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3197 if (flow_table && flow_id <= flow_table->mask) {
3198 rflow = &flow_table->flows[flow_id];
3199 cpu = ACCESS_ONCE(rflow->cpu);
3200 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3201 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3202 rflow->last_qtail) <
3203 (int)(10 * flow_table->mask)))
3204 expire = false;
3205 }
3206 rcu_read_unlock();
3207 return expire;
3208}
3209EXPORT_SYMBOL(rps_may_expire_flow);
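/*
 * Illustrative sketch (not part of dev.c itself): the periodic scan a driver
 * implementing ndo_rx_flow_steer() might run, removing hardware filters that
 * rps_may_expire_flow() says are no longer wanted.  The filter table layout
 * and the hardware removal step are hypothetical.
 */
#if 0	/* example only */
struct example_filter {
	u32 flow_id;
	bool in_use;
};

static void example_expire_rfs_filters(struct net_device *dev, u16 rxq,
					struct example_filter *ftab, u16 n)
{
	u16 i;

	for (i = 0; i < n; i++) {
		if (!ftab[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, rxq, ftab[i].flow_id, i)) {
			/* ... remove hardware filter "i" here ... */
			ftab[i].in_use = false;
		}
	}
}
#endif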
3210
3211#endif /* CONFIG_RFS_ACCEL */
3212
0a9627f2 3213/* Called from hardirq (IPI) context */
e36fa2f7 3214static void rps_trigger_softirq(void *data)
0a9627f2 3215{
e36fa2f7
ED
3216 struct softnet_data *sd = data;
3217
eecfd7c4 3218 ____napi_schedule(sd, &sd->backlog);
dee42870 3219 sd->received_rps++;
0a9627f2 3220}
e36fa2f7 3221
fec5e652 3222#endif /* CONFIG_RPS */
0a9627f2 3223
e36fa2f7
ED
3224/*
 3225 * Check if this softnet_data structure belongs to another CPU.
 3226 * If yes, queue it to our IPI list and return 1;
 3227 * if no, return 0.
3228 */
3229static int rps_ipi_queued(struct softnet_data *sd)
3230{
3231#ifdef CONFIG_RPS
903ceff7 3232 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
e36fa2f7
ED
3233
3234 if (sd != mysd) {
3235 sd->rps_ipi_next = mysd->rps_ipi_list;
3236 mysd->rps_ipi_list = sd;
3237
3238 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3239 return 1;
3240 }
3241#endif /* CONFIG_RPS */
3242 return 0;
3243}
3244
99bbc707
WB
3245#ifdef CONFIG_NET_FLOW_LIMIT
3246int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3247#endif
3248
3249static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3250{
3251#ifdef CONFIG_NET_FLOW_LIMIT
3252 struct sd_flow_limit *fl;
3253 struct softnet_data *sd;
3254 unsigned int old_flow, new_flow;
3255
3256 if (qlen < (netdev_max_backlog >> 1))
3257 return false;
3258
903ceff7 3259 sd = this_cpu_ptr(&softnet_data);
99bbc707
WB
3260
3261 rcu_read_lock();
3262 fl = rcu_dereference(sd->flow_limit);
3263 if (fl) {
3958afa1 3264 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
99bbc707
WB
3265 old_flow = fl->history[fl->history_head];
3266 fl->history[fl->history_head] = new_flow;
3267
3268 fl->history_head++;
3269 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3270
3271 if (likely(fl->buckets[old_flow]))
3272 fl->buckets[old_flow]--;
3273
3274 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3275 fl->count++;
3276 rcu_read_unlock();
3277 return true;
3278 }
3279 }
3280 rcu_read_unlock();
3281#endif
3282 return false;
3283}
3284
0a9627f2
TH
3285/*
3286 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3287 * queue (may be a remote CPU queue).
3288 */
fec5e652
TH
3289static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3290 unsigned int *qtail)
0a9627f2 3291{
e36fa2f7 3292 struct softnet_data *sd;
0a9627f2 3293 unsigned long flags;
99bbc707 3294 unsigned int qlen;
0a9627f2 3295
e36fa2f7 3296 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3297
3298 local_irq_save(flags);
0a9627f2 3299
e36fa2f7 3300 rps_lock(sd);
99bbc707
WB
3301 qlen = skb_queue_len(&sd->input_pkt_queue);
3302 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
e008f3f0 3303 if (qlen) {
0a9627f2 3304enqueue:
e36fa2f7 3305 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3306 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3307 rps_unlock(sd);
152102c7 3308 local_irq_restore(flags);
0a9627f2
TH
3309 return NET_RX_SUCCESS;
3310 }
3311
ebda37c2
ED
3312 /* Schedule NAPI for backlog device
 3313 * We can use a non-atomic operation since we own the queue lock
3314 */
3315 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3316 if (!rps_ipi_queued(sd))
eecfd7c4 3317 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3318 }
3319 goto enqueue;
3320 }
3321
dee42870 3322 sd->dropped++;
e36fa2f7 3323 rps_unlock(sd);
0a9627f2 3324
0a9627f2
TH
3325 local_irq_restore(flags);
3326
caf586e5 3327 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3328 kfree_skb(skb);
3329 return NET_RX_DROP;
3330}
1da177e4 3331
ae78dbfa 3332static int netif_rx_internal(struct sk_buff *skb)
1da177e4 3333{
b0e28f1e 3334 int ret;
1da177e4 3335
588f0330 3336 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3337
cf66ba58 3338 trace_netif_rx(skb);
df334545 3339#ifdef CONFIG_RPS
c5905afb 3340 if (static_key_false(&rps_needed)) {
fec5e652 3341 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3342 int cpu;
3343
cece1945 3344 preempt_disable();
b0e28f1e 3345 rcu_read_lock();
fec5e652
TH
3346
3347 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3348 if (cpu < 0)
3349 cpu = smp_processor_id();
fec5e652
TH
3350
3351 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3352
b0e28f1e 3353 rcu_read_unlock();
cece1945 3354 preempt_enable();
adc9300e
ED
3355 } else
3356#endif
fec5e652
TH
3357 {
3358 unsigned int qtail;
3359 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3360 put_cpu();
3361 }
b0e28f1e 3362 return ret;
1da177e4 3363}
ae78dbfa
BH
3364
3365/**
3366 * netif_rx - post buffer to the network code
3367 * @skb: buffer to post
3368 *
3369 * This function receives a packet from a device driver and queues it for
3370 * the upper (protocol) levels to process. It always succeeds. The buffer
3371 * may be dropped during processing for congestion control or by the
3372 * protocol layers.
3373 *
3374 * return values:
3375 * NET_RX_SUCCESS (no congestion)
3376 * NET_RX_DROP (packet was dropped)
3377 *
3378 */
3379
3380int netif_rx(struct sk_buff *skb)
3381{
3382 trace_netif_rx_entry(skb);
3383
3384 return netif_rx_internal(skb);
3385}
d1b19dff 3386EXPORT_SYMBOL(netif_rx);
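/*
 * Illustrative sketch (not part of dev.c itself): the classic interrupt-time
 * RX path of a simple driver, copying a frame out of the device and posting
 * it with netif_rx().  "hw_buf" stands in for the device's receive buffer.
 */
#if 0	/* example only */
static void example_rx_one(struct net_device *dev, const void *hw_buf,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), hw_buf, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}
#endif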
1da177e4
LT
3387
3388int netif_rx_ni(struct sk_buff *skb)
3389{
3390 int err;
3391
ae78dbfa
BH
3392 trace_netif_rx_ni_entry(skb);
3393
1da177e4 3394 preempt_disable();
ae78dbfa 3395 err = netif_rx_internal(skb);
1da177e4
LT
3396 if (local_softirq_pending())
3397 do_softirq();
3398 preempt_enable();
3399
3400 return err;
3401}
1da177e4
LT
3402EXPORT_SYMBOL(netif_rx_ni);
3403
1da177e4
LT
3404static void net_tx_action(struct softirq_action *h)
3405{
903ceff7 3406 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
1da177e4
LT
3407
3408 if (sd->completion_queue) {
3409 struct sk_buff *clist;
3410
3411 local_irq_disable();
3412 clist = sd->completion_queue;
3413 sd->completion_queue = NULL;
3414 local_irq_enable();
3415
3416 while (clist) {
3417 struct sk_buff *skb = clist;
3418 clist = clist->next;
3419
547b792c 3420 WARN_ON(atomic_read(&skb->users));
e6247027
ED
3421 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3422 trace_consume_skb(skb);
3423 else
3424 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3425 __kfree_skb(skb);
3426 }
3427 }
3428
3429 if (sd->output_queue) {
37437bb2 3430 struct Qdisc *head;
1da177e4
LT
3431
3432 local_irq_disable();
3433 head = sd->output_queue;
3434 sd->output_queue = NULL;
a9cbd588 3435 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3436 local_irq_enable();
3437
3438 while (head) {
37437bb2
DM
3439 struct Qdisc *q = head;
3440 spinlock_t *root_lock;
3441
1da177e4
LT
3442 head = head->next_sched;
3443
5fb66229 3444 root_lock = qdisc_lock(q);
37437bb2 3445 if (spin_trylock(root_lock)) {
4e857c58 3446 smp_mb__before_atomic();
def82a1d
JP
3447 clear_bit(__QDISC_STATE_SCHED,
3448 &q->state);
37437bb2
DM
3449 qdisc_run(q);
3450 spin_unlock(root_lock);
1da177e4 3451 } else {
195648bb 3452 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3453 &q->state)) {
195648bb 3454 __netif_reschedule(q);
e8a83e10 3455 } else {
4e857c58 3456 smp_mb__before_atomic();
e8a83e10
JP
3457 clear_bit(__QDISC_STATE_SCHED,
3458 &q->state);
3459 }
1da177e4
LT
3460 }
3461 }
3462 }
3463}
3464
ab95bfe0
JP
3465#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3466 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3467/* This hook is defined here for ATM LANE */
3468int (*br_fdb_test_addr_hook)(struct net_device *dev,
3469 unsigned char *addr) __read_mostly;
4fb019a0 3470EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3471#endif
1da177e4 3472
1da177e4
LT
3473#ifdef CONFIG_NET_CLS_ACT
3474/* TODO: Maybe we should just force sch_ingress to be compiled in
 3475 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
 3476 * instructions (a compare and two extra stores) right now if we don't
 3477 * have it on but do have CONFIG_NET_CLS_ACT.
25985edc
LDM
 3478 * NOTE: This doesn't stop any functionality; if you don't have
3479 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3480 *
3481 */
24824a09 3482static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3483{
1da177e4 3484 struct net_device *dev = skb->dev;
f697c3e8 3485 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3486 int result = TC_ACT_OK;
3487 struct Qdisc *q;
4ec93edb 3488
de384830 3489 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3490 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3491 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3492 return TC_ACT_SHOT;
3493 }
1da177e4 3494
f697c3e8
HX
3495 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3496 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3497
46e5da40 3498 q = rcu_dereference(rxq->qdisc);
8d50b53d 3499 if (q != &noop_qdisc) {
83874000 3500 spin_lock(qdisc_lock(q));
a9312ae8
DM
3501 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3502 result = qdisc_enqueue_root(skb, q);
83874000
DM
3503 spin_unlock(qdisc_lock(q));
3504 }
f697c3e8
HX
3505
3506 return result;
3507}
86e65da9 3508
f697c3e8
HX
3509static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3510 struct packet_type **pt_prev,
3511 int *ret, struct net_device *orig_dev)
3512{
24824a09
ED
3513 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3514
46e5da40 3515 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
f697c3e8 3516 goto out;
1da177e4 3517
f697c3e8
HX
3518 if (*pt_prev) {
3519 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3520 *pt_prev = NULL;
1da177e4
LT
3521 }
3522
24824a09 3523 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3524 case TC_ACT_SHOT:
3525 case TC_ACT_STOLEN:
3526 kfree_skb(skb);
3527 return NULL;
3528 }
3529
3530out:
3531 skb->tc_verd = 0;
3532 return skb;
1da177e4
LT
3533}
3534#endif
3535
ab95bfe0
JP
3536/**
3537 * netdev_rx_handler_register - register receive handler
3538 * @dev: device to register a handler for
3539 * @rx_handler: receive handler to register
93e2c32b 3540 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0 3541 *
e227867f 3542 * Register a receive handler for a device. This handler will then be
ab95bfe0
JP
3543 * called from __netif_receive_skb. A negative errno code is returned
3544 * on a failure.
3545 *
3546 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3547 *
3548 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3549 */
3550int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3551 rx_handler_func_t *rx_handler,
3552 void *rx_handler_data)
ab95bfe0
JP
3553{
3554 ASSERT_RTNL();
3555
3556 if (dev->rx_handler)
3557 return -EBUSY;
3558
00cfec37 3559 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3560 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3561 rcu_assign_pointer(dev->rx_handler, rx_handler);
3562
3563 return 0;
3564}
3565EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
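/*
 * Example (editor's illustrative sketch, not part of dev.c): a minimal
 * rx_handler in the style of bridge/bonding/team.  "struct example_port"
 * and example_attach_rx_handler() are hypothetical; the handler shows
 * the RX_HANDLER_* contract honoured by __netif_receive_skb_core() below.
 */
struct example_port {
	struct net_device *master;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return RX_HANDLER_PASS;		/* let the normal path see it */

	skb->dev = port->master;		/* steer the frame to the master */
	*pskb = skb;
	return RX_HANDLER_ANOTHER;		/* rerun another_round for the new dev */
}

static int example_attach_rx_handler(struct net_device *port_dev,
				     struct example_port *port)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, example_handle_frame, port);
	rtnl_unlock();
	return err;
}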
3566
3567/**
3568 * netdev_rx_handler_unregister - unregister receive handler
3569 * @dev: device to unregister a handler from
3570 *
166ec369 3571 * Unregister a receive handler from a device.
ab95bfe0
JP
3572 *
3573 * The caller must hold the rtnl_mutex.
3574 */
3575void netdev_rx_handler_unregister(struct net_device *dev)
3576{
3577
3578 ASSERT_RTNL();
a9b3cd7f 3579 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3580 /* A reader seeing a non-NULL rx_handler in an rcu_read_lock()
3581 * section is guaranteed to see a non-NULL rx_handler_data
3582 * as well.
3583 */
3584 synchronize_net();
a9b3cd7f 3585 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3586}
3587EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3588
b4b9e355
MG
3589/*
3590 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3591 * the special handling of PFMEMALLOC skbs.
3592 */
3593static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3594{
3595 switch (skb->protocol) {
2b8837ae
JP
3596 case htons(ETH_P_ARP):
3597 case htons(ETH_P_IP):
3598 case htons(ETH_P_IPV6):
3599 case htons(ETH_P_8021Q):
3600 case htons(ETH_P_8021AD):
b4b9e355
MG
3601 return true;
3602 default:
3603 return false;
3604 }
3605}
3606
9754e293 3607static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3608{
3609 struct packet_type *ptype, *pt_prev;
ab95bfe0 3610 rx_handler_func_t *rx_handler;
f2ccd8fa 3611 struct net_device *orig_dev;
63d8ea7f 3612 struct net_device *null_or_dev;
8a4eb573 3613 bool deliver_exact = false;
1da177e4 3614 int ret = NET_RX_DROP;
252e3346 3615 __be16 type;
1da177e4 3616
588f0330 3617 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3618
cf66ba58 3619 trace_netif_receive_skb(skb);
9b22ea56 3620
cc9bd5ce 3621 orig_dev = skb->dev;
8f903c70 3622
c1d2bbe1 3623 skb_reset_network_header(skb);
fda55eca
ED
3624 if (!skb_transport_header_was_set(skb))
3625 skb_reset_transport_header(skb);
0b5c9db1 3626 skb_reset_mac_len(skb);
1da177e4
LT
3627
3628 pt_prev = NULL;
3629
3630 rcu_read_lock();
3631
63d8ea7f 3632another_round:
b6858177 3633 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3634
3635 __this_cpu_inc(softnet_data.processed);
3636
8ad227ff
PM
3637 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3638 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
0d5501c1 3639 skb = skb_vlan_untag(skb);
bcc6d479 3640 if (unlikely(!skb))
b4b9e355 3641 goto unlock;
bcc6d479
JP
3642 }
3643
1da177e4
LT
3644#ifdef CONFIG_NET_CLS_ACT
3645 if (skb->tc_verd & TC_NCLS) {
3646 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3647 goto ncls;
3648 }
3649#endif
3650
9754e293 3651 if (pfmemalloc)
b4b9e355
MG
3652 goto skip_taps;
3653
1da177e4 3654 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3655 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3656 if (pt_prev)
f2ccd8fa 3657 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3658 pt_prev = ptype;
3659 }
3660 }
3661
b4b9e355 3662skip_taps:
1da177e4 3663#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3664 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3665 if (!skb)
b4b9e355 3666 goto unlock;
1da177e4
LT
3667ncls:
3668#endif
3669
9754e293 3670 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3671 goto drop;
3672
2425717b
JF
3673 if (vlan_tx_tag_present(skb)) {
3674 if (pt_prev) {
3675 ret = deliver_skb(skb, pt_prev, orig_dev);
3676 pt_prev = NULL;
3677 }
48cc32d3 3678 if (vlan_do_receive(&skb))
2425717b
JF
3679 goto another_round;
3680 else if (unlikely(!skb))
b4b9e355 3681 goto unlock;
2425717b
JF
3682 }
3683
48cc32d3 3684 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3685 if (rx_handler) {
3686 if (pt_prev) {
3687 ret = deliver_skb(skb, pt_prev, orig_dev);
3688 pt_prev = NULL;
3689 }
8a4eb573
JP
3690 switch (rx_handler(&skb)) {
3691 case RX_HANDLER_CONSUMED:
3bc1b1ad 3692 ret = NET_RX_SUCCESS;
b4b9e355 3693 goto unlock;
8a4eb573 3694 case RX_HANDLER_ANOTHER:
63d8ea7f 3695 goto another_round;
8a4eb573
JP
3696 case RX_HANDLER_EXACT:
3697 deliver_exact = true;
3698 case RX_HANDLER_PASS:
3699 break;
3700 default:
3701 BUG();
3702 }
ab95bfe0 3703 }
1da177e4 3704
d4b812de
ED
3705 if (unlikely(vlan_tx_tag_present(skb))) {
3706 if (vlan_tx_tag_get_id(skb))
3707 skb->pkt_type = PACKET_OTHERHOST;
3708 /* Note: we might in the future use prio bits
3709 * and set skb->priority like in vlan_do_receive()
3710 * For the time being, just ignore Priority Code Point
3711 */
3712 skb->vlan_tci = 0;
3713 }
48cc32d3 3714
63d8ea7f 3715 /* deliver only exact match when indicated */
8a4eb573 3716 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3717
1da177e4 3718 type = skb->protocol;
82d8a867
PE
3719 list_for_each_entry_rcu(ptype,
3720 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3721 if (ptype->type == type &&
e3f48d37
JP
3722 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3723 ptype->dev == orig_dev)) {
4ec93edb 3724 if (pt_prev)
f2ccd8fa 3725 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3726 pt_prev = ptype;
3727 }
3728 }
3729
3730 if (pt_prev) {
1080e512 3731 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3732 goto drop;
1080e512
MT
3733 else
3734 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3735 } else {
b4b9e355 3736drop:
caf586e5 3737 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3738 kfree_skb(skb);
3739 /* Jamal, now you will not be able to escape explaining
3740 * to me how you were going to use this. :-)
3741 */
3742 ret = NET_RX_DROP;
3743 }
3744
b4b9e355 3745unlock:
1da177e4 3746 rcu_read_unlock();
9754e293
DM
3747 return ret;
3748}
3749
3750static int __netif_receive_skb(struct sk_buff *skb)
3751{
3752 int ret;
3753
3754 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3755 unsigned long pflags = current->flags;
3756
3757 /*
3758 * PFMEMALLOC skbs are special, they should
3759 * - be delivered to SOCK_MEMALLOC sockets only
3760 * - stay away from userspace
3761 * - have bounded memory usage
3762 *
3763 * Use PF_MEMALLOC as this saves us from propagating the allocation
3764 * context down to all allocation sites.
3765 */
3766 current->flags |= PF_MEMALLOC;
3767 ret = __netif_receive_skb_core(skb, true);
3768 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3769 } else
3770 ret = __netif_receive_skb_core(skb, false);
3771
1da177e4
LT
3772 return ret;
3773}
0a9627f2 3774
ae78dbfa 3775static int netif_receive_skb_internal(struct sk_buff *skb)
0a9627f2 3776{
588f0330 3777 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3778
c1f19b51
RC
3779 if (skb_defer_rx_timestamp(skb))
3780 return NET_RX_SUCCESS;
3781
df334545 3782#ifdef CONFIG_RPS
c5905afb 3783 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3784 struct rps_dev_flow voidflow, *rflow = &voidflow;
3785 int cpu, ret;
fec5e652 3786
3b098e2d
ED
3787 rcu_read_lock();
3788
3789 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3790
3b098e2d
ED
3791 if (cpu >= 0) {
3792 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3793 rcu_read_unlock();
adc9300e 3794 return ret;
3b098e2d 3795 }
adc9300e 3796 rcu_read_unlock();
fec5e652 3797 }
1e94d72f 3798#endif
adc9300e 3799 return __netif_receive_skb(skb);
0a9627f2 3800}
ae78dbfa
BH
3801
3802/**
3803 * netif_receive_skb - process receive buffer from network
3804 * @skb: buffer to process
3805 *
3806 * netif_receive_skb() is the main receive data processing function.
3807 * It always succeeds. The buffer may be dropped during processing
3808 * for congestion control or by the protocol layers.
3809 *
3810 * This function may only be called from softirq context and interrupts
3811 * should be enabled.
3812 *
3813 * Return values (usually ignored):
3814 * NET_RX_SUCCESS: no congestion
3815 * NET_RX_DROP: packet was dropped
3816 */
3817int netif_receive_skb(struct sk_buff *skb)
3818{
3819 trace_netif_receive_skb_entry(skb);
3820
3821 return netif_receive_skb_internal(skb);
3822}
d1b19dff 3823EXPORT_SYMBOL(netif_receive_skb);
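/*
 * Example (editor's illustrative sketch, not part of dev.c): the
 * simplest use of netif_receive_skb() - a virtual device handing a
 * complete frame back to the stack from softirq context.
 */
static int example_loop_back(struct net_device *dev, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, dev);
	return netif_receive_skb(skb);	/* NET_RX_SUCCESS or NET_RX_DROP */
}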
1da177e4 3824
88751275
ED
3825/* The network device is going away; flush any packets still pending.
3826 * Called with irqs disabled.
3827 */
152102c7 3828static void flush_backlog(void *arg)
6e583ce5 3829{
152102c7 3830 struct net_device *dev = arg;
903ceff7 3831 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
6e583ce5
SH
3832 struct sk_buff *skb, *tmp;
3833
e36fa2f7 3834 rps_lock(sd);
6e7676c1 3835 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3836 if (skb->dev == dev) {
e36fa2f7 3837 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3838 kfree_skb(skb);
76cc8b13 3839 input_queue_head_incr(sd);
6e583ce5 3840 }
6e7676c1 3841 }
e36fa2f7 3842 rps_unlock(sd);
6e7676c1
CG
3843
3844 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3845 if (skb->dev == dev) {
3846 __skb_unlink(skb, &sd->process_queue);
3847 kfree_skb(skb);
76cc8b13 3848 input_queue_head_incr(sd);
6e7676c1
CG
3849 }
3850 }
6e583ce5
SH
3851}
3852
d565b0a1
HX
3853static int napi_gro_complete(struct sk_buff *skb)
3854{
22061d80 3855 struct packet_offload *ptype;
d565b0a1 3856 __be16 type = skb->protocol;
22061d80 3857 struct list_head *head = &offload_base;
d565b0a1
HX
3858 int err = -ENOENT;
3859
c3c7c254
ED
3860 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3861
fc59f9a3
HX
3862 if (NAPI_GRO_CB(skb)->count == 1) {
3863 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3864 goto out;
fc59f9a3 3865 }
d565b0a1
HX
3866
3867 rcu_read_lock();
3868 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3869 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3870 continue;
3871
299603e8 3872 err = ptype->callbacks.gro_complete(skb, 0);
d565b0a1
HX
3873 break;
3874 }
3875 rcu_read_unlock();
3876
3877 if (err) {
3878 WARN_ON(&ptype->list == head);
3879 kfree_skb(skb);
3880 return NET_RX_SUCCESS;
3881 }
3882
3883out:
ae78dbfa 3884 return netif_receive_skb_internal(skb);
d565b0a1
HX
3885}
3886
2e71a6f8
ED
3887/* napi->gro_list contains packets ordered by age.
3888 * The youngest packets are at the head of the list.
3889 * Complete skbs in reverse order to reduce latencies.
3890 */
3891void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3892{
2e71a6f8 3893 struct sk_buff *skb, *prev = NULL;
d565b0a1 3894
2e71a6f8
ED
3895 /* scan list and build reverse chain */
3896 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3897 skb->prev = prev;
3898 prev = skb;
3899 }
3900
3901 for (skb = prev; skb; skb = prev) {
d565b0a1 3902 skb->next = NULL;
2e71a6f8
ED
3903
3904 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3905 return;
3906
3907 prev = skb->prev;
d565b0a1 3908 napi_gro_complete(skb);
2e71a6f8 3909 napi->gro_count--;
d565b0a1
HX
3910 }
3911
3912 napi->gro_list = NULL;
3913}
86cac58b 3914EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3915
89c5fa33
ED
3916static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3917{
3918 struct sk_buff *p;
3919 unsigned int maclen = skb->dev->hard_header_len;
0b4cec8c 3920 u32 hash = skb_get_hash_raw(skb);
89c5fa33
ED
3921
3922 for (p = napi->gro_list; p; p = p->next) {
3923 unsigned long diffs;
3924
0b4cec8c
TH
3925 NAPI_GRO_CB(p)->flush = 0;
3926
3927 if (hash != skb_get_hash_raw(p)) {
3928 NAPI_GRO_CB(p)->same_flow = 0;
3929 continue;
3930 }
3931
89c5fa33
ED
3932 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3933 diffs |= p->vlan_tci ^ skb->vlan_tci;
3934 if (maclen == ETH_HLEN)
3935 diffs |= compare_ether_header(skb_mac_header(p),
a50e233c 3936 skb_mac_header(skb));
89c5fa33
ED
3937 else if (!diffs)
3938 diffs = memcmp(skb_mac_header(p),
a50e233c 3939 skb_mac_header(skb),
89c5fa33
ED
3940 maclen);
3941 NAPI_GRO_CB(p)->same_flow = !diffs;
89c5fa33
ED
3942 }
3943}
3944
299603e8
JC
3945static void skb_gro_reset_offset(struct sk_buff *skb)
3946{
3947 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3948 const skb_frag_t *frag0 = &pinfo->frags[0];
3949
3950 NAPI_GRO_CB(skb)->data_offset = 0;
3951 NAPI_GRO_CB(skb)->frag0 = NULL;
3952 NAPI_GRO_CB(skb)->frag0_len = 0;
3953
3954 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3955 pinfo->nr_frags &&
3956 !PageHighMem(skb_frag_page(frag0))) {
3957 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3958 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
89c5fa33
ED
3959 }
3960}
3961
a50e233c
ED
3962static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3963{
3964 struct skb_shared_info *pinfo = skb_shinfo(skb);
3965
3966 BUG_ON(skb->end - skb->tail < grow);
3967
3968 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3969
3970 skb->data_len -= grow;
3971 skb->tail += grow;
3972
3973 pinfo->frags[0].page_offset += grow;
3974 skb_frag_size_sub(&pinfo->frags[0], grow);
3975
3976 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3977 skb_frag_unref(skb, 0);
3978 memmove(pinfo->frags, pinfo->frags + 1,
3979 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3980 }
3981}
3982
bb728820 3983static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3984{
3985 struct sk_buff **pp = NULL;
22061d80 3986 struct packet_offload *ptype;
d565b0a1 3987 __be16 type = skb->protocol;
22061d80 3988 struct list_head *head = &offload_base;
0da2afd5 3989 int same_flow;
5b252f0c 3990 enum gro_result ret;
a50e233c 3991 int grow;
d565b0a1 3992
9c62a68d 3993 if (!(skb->dev->features & NETIF_F_GRO))
d565b0a1
HX
3994 goto normal;
3995
5a212329 3996 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
f17f5c91
HX
3997 goto normal;
3998
89c5fa33
ED
3999 gro_list_prepare(napi, skb);
4000
d565b0a1
HX
4001 rcu_read_lock();
4002 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 4003 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
4004 continue;
4005
86911732 4006 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 4007 skb_reset_mac_len(skb);
d565b0a1
HX
4008 NAPI_GRO_CB(skb)->same_flow = 0;
4009 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 4010 NAPI_GRO_CB(skb)->free = 0;
b582ef09 4011 NAPI_GRO_CB(skb)->udp_mark = 0;
d565b0a1 4012
662880f4
TH
4013 /* Setup for GRO checksum validation */
4014 switch (skb->ip_summed) {
4015 case CHECKSUM_COMPLETE:
4016 NAPI_GRO_CB(skb)->csum = skb->csum;
4017 NAPI_GRO_CB(skb)->csum_valid = 1;
4018 NAPI_GRO_CB(skb)->csum_cnt = 0;
4019 break;
4020 case CHECKSUM_UNNECESSARY:
4021 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4022 NAPI_GRO_CB(skb)->csum_valid = 0;
4023 break;
4024 default:
4025 NAPI_GRO_CB(skb)->csum_cnt = 0;
4026 NAPI_GRO_CB(skb)->csum_valid = 0;
4027 }
d565b0a1 4028
f191a1d1 4029 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
4030 break;
4031 }
4032 rcu_read_unlock();
4033
4034 if (&ptype->list == head)
4035 goto normal;
4036
0da2afd5 4037 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 4038 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 4039
d565b0a1
HX
4040 if (pp) {
4041 struct sk_buff *nskb = *pp;
4042
4043 *pp = nskb->next;
4044 nskb->next = NULL;
4045 napi_gro_complete(nskb);
4ae5544f 4046 napi->gro_count--;
d565b0a1
HX
4047 }
4048
0da2afd5 4049 if (same_flow)
d565b0a1
HX
4050 goto ok;
4051
600adc18 4052 if (NAPI_GRO_CB(skb)->flush)
d565b0a1 4053 goto normal;
d565b0a1 4054
600adc18
ED
4055 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4056 struct sk_buff *nskb = napi->gro_list;
4057
4058 /* locate the end of the list to select the 'oldest' flow */
4059 while (nskb->next) {
4060 pp = &nskb->next;
4061 nskb = *pp;
4062 }
4063 *pp = NULL;
4064 nskb->next = NULL;
4065 napi_gro_complete(nskb);
4066 } else {
4067 napi->gro_count++;
4068 }
d565b0a1 4069 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 4070 NAPI_GRO_CB(skb)->age = jiffies;
29e98242 4071 NAPI_GRO_CB(skb)->last = skb;
86911732 4072 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
4073 skb->next = napi->gro_list;
4074 napi->gro_list = skb;
5d0d9be8 4075 ret = GRO_HELD;
d565b0a1 4076
ad0f9904 4077pull:
a50e233c
ED
4078 grow = skb_gro_offset(skb) - skb_headlen(skb);
4079 if (grow > 0)
4080 gro_pull_from_frag0(skb, grow);
d565b0a1 4081ok:
5d0d9be8 4082 return ret;
d565b0a1
HX
4083
4084normal:
ad0f9904
HX
4085 ret = GRO_NORMAL;
4086 goto pull;
5d38a079 4087}
96e93eab 4088
bf5a755f
JC
4089struct packet_offload *gro_find_receive_by_type(__be16 type)
4090{
4091 struct list_head *offload_head = &offload_base;
4092 struct packet_offload *ptype;
4093
4094 list_for_each_entry_rcu(ptype, offload_head, list) {
4095 if (ptype->type != type || !ptype->callbacks.gro_receive)
4096 continue;
4097 return ptype;
4098 }
4099 return NULL;
4100}
e27a2f83 4101EXPORT_SYMBOL(gro_find_receive_by_type);
bf5a755f
JC
4102
4103struct packet_offload *gro_find_complete_by_type(__be16 type)
4104{
4105 struct list_head *offload_head = &offload_base;
4106 struct packet_offload *ptype;
4107
4108 list_for_each_entry_rcu(ptype, offload_head, list) {
4109 if (ptype->type != type || !ptype->callbacks.gro_complete)
4110 continue;
4111 return ptype;
4112 }
4113 return NULL;
4114}
e27a2f83 4115EXPORT_SYMBOL(gro_find_complete_by_type);
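/*
 * Example (editor's illustrative sketch, not part of dev.c): how an
 * encapsulation offload can chain to an inner protocol's GRO callback
 * through the lookup helpers above, roughly the way UDP tunnel offloads
 * do.  Header pulling and encapsulation bookkeeping are omitted, and
 * the inner protocol (ETH_P_IP) is only an assumption for illustration.
 */
static struct sk_buff **example_encap_gro_receive(struct sk_buff **head,
						  struct sk_buff *skb)
{
	struct packet_offload *ptype;
	struct sk_buff **pp = NULL;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(htons(ETH_P_IP));
	if (ptype)
		pp = ptype->callbacks.gro_receive(head, skb);
	rcu_read_unlock();

	return pp;
}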
5d38a079 4116
bb728820 4117static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 4118{
5d0d9be8
HX
4119 switch (ret) {
4120 case GRO_NORMAL:
ae78dbfa 4121 if (netif_receive_skb_internal(skb))
c7c4b3b6
BH
4122 ret = GRO_DROP;
4123 break;
5d38a079 4124
5d0d9be8 4125 case GRO_DROP:
5d38a079
HX
4126 kfree_skb(skb);
4127 break;
5b252f0c 4128
daa86548 4129 case GRO_MERGED_FREE:
d7e8883c
ED
4130 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4131 kmem_cache_free(skbuff_head_cache, skb);
4132 else
4133 __kfree_skb(skb);
daa86548
ED
4134 break;
4135
5b252f0c
BH
4136 case GRO_HELD:
4137 case GRO_MERGED:
4138 break;
5d38a079
HX
4139 }
4140
c7c4b3b6 4141 return ret;
5d0d9be8 4142}
5d0d9be8 4143
c7c4b3b6 4144gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 4145{
ae78dbfa 4146 trace_napi_gro_receive_entry(skb);
86911732 4147
a50e233c
ED
4148 skb_gro_reset_offset(skb);
4149
89c5fa33 4150 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
4151}
4152EXPORT_SYMBOL(napi_gro_receive);
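/*
 * Example (editor's illustrative sketch, not part of dev.c): the usual
 * driver-side call.  The skb is assumed to have its MAC header in the
 * linear area so eth_type_trans() can run before handing it to GRO.
 */
static void example_rx_one(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);	/* may merge, hold or deliver the skb */
}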
4153
d0c2b0d2 4154static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 4155{
93a35f59
ED
4156 if (unlikely(skb->pfmemalloc)) {
4157 consume_skb(skb);
4158 return;
4159 }
96e93eab 4160 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
4161 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4162 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 4163 skb->vlan_tci = 0;
66c46d74 4164 skb->dev = napi->dev;
6d152e23 4165 skb->skb_iif = 0;
c3caf119
JC
4166 skb->encapsulation = 0;
4167 skb_shinfo(skb)->gso_type = 0;
e33d0ba8 4168 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
4169
4170 napi->skb = skb;
4171}
96e93eab 4172
76620aaf 4173struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 4174{
5d38a079 4175 struct sk_buff *skb = napi->skb;
5d38a079
HX
4176
4177 if (!skb) {
fd11a83d 4178 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
84b9cd63 4179 napi->skb = skb;
80595d59 4180 }
96e93eab
HX
4181 return skb;
4182}
76620aaf 4183EXPORT_SYMBOL(napi_get_frags);
96e93eab 4184
a50e233c
ED
4185static gro_result_t napi_frags_finish(struct napi_struct *napi,
4186 struct sk_buff *skb,
4187 gro_result_t ret)
96e93eab 4188{
5d0d9be8
HX
4189 switch (ret) {
4190 case GRO_NORMAL:
a50e233c
ED
4191 case GRO_HELD:
4192 __skb_push(skb, ETH_HLEN);
4193 skb->protocol = eth_type_trans(skb, skb->dev);
4194 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
c7c4b3b6 4195 ret = GRO_DROP;
86911732 4196 break;
5d38a079 4197
5d0d9be8 4198 case GRO_DROP:
5d0d9be8
HX
4199 case GRO_MERGED_FREE:
4200 napi_reuse_skb(napi, skb);
4201 break;
5b252f0c
BH
4202
4203 case GRO_MERGED:
4204 break;
5d0d9be8 4205 }
5d38a079 4206
c7c4b3b6 4207 return ret;
5d38a079 4208}
5d0d9be8 4209
a50e233c
ED
4210/* The upper GRO stack assumes the network header starts at gro_offset=0.
4211 * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4212 * we copy the Ethernet header into skb->data to have a common layout.
4213 */
4adb9c4a 4214static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
4215{
4216 struct sk_buff *skb = napi->skb;
a50e233c
ED
4217 const struct ethhdr *eth;
4218 unsigned int hlen = sizeof(*eth);
76620aaf
HX
4219
4220 napi->skb = NULL;
4221
a50e233c
ED
4222 skb_reset_mac_header(skb);
4223 skb_gro_reset_offset(skb);
4224
4225 eth = skb_gro_header_fast(skb, 0);
4226 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4227 eth = skb_gro_header_slow(skb, hlen, 0);
4228 if (unlikely(!eth)) {
4229 napi_reuse_skb(napi, skb);
4230 return NULL;
4231 }
4232 } else {
4233 gro_pull_from_frag0(skb, hlen);
4234 NAPI_GRO_CB(skb)->frag0 += hlen;
4235 NAPI_GRO_CB(skb)->frag0_len -= hlen;
76620aaf 4236 }
a50e233c
ED
4237 __skb_pull(skb, hlen);
4238
4239 /*
4240 * This works because the only protocols we care about don't require
4241 * special handling.
4242 * We'll fix it up properly in napi_frags_finish()
4243 */
4244 skb->protocol = eth->h_proto;
76620aaf 4245
76620aaf
HX
4246 return skb;
4247}
76620aaf 4248
c7c4b3b6 4249gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 4250{
76620aaf 4251 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
4252
4253 if (!skb)
c7c4b3b6 4254 return GRO_DROP;
5d0d9be8 4255
ae78dbfa
BH
4256 trace_napi_gro_frags_entry(skb);
4257
89c5fa33 4258 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 4259}
5d38a079
HX
4260EXPORT_SYMBOL(napi_gro_frags);
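/*
 * Example (editor's illustrative sketch, not part of dev.c): the
 * napi_get_frags()/napi_gro_frags() pattern used by drivers that
 * receive straight into pages.  The page/offset/len arguments are
 * assumed to come from a device RX descriptor.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);
		return;
	}
	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);

	napi_gro_frags(napi);	/* parses the Ethernet header itself */
}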
4261
573e8fca
TH
4262/* Compute the checksum from gro_offset and return the folded value
4263 * after adding in any pseudo checksum.
4264 */
4265__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4266{
4267 __wsum wsum;
4268 __sum16 sum;
4269
4270 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4271
4272 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4273 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4274 if (likely(!sum)) {
4275 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4276 !skb->csum_complete_sw)
4277 netdev_rx_csum_fault(skb->dev);
4278 }
4279
4280 NAPI_GRO_CB(skb)->csum = wsum;
4281 NAPI_GRO_CB(skb)->csum_valid = 1;
4282
4283 return sum;
4284}
4285EXPORT_SYMBOL(__skb_gro_checksum_complete);
4286
e326bed2 4287/*
855abcf0 4288 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
e326bed2
ED
4289 * Note: called with local irq disabled, but exits with local irq enabled.
4290 */
4291static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4292{
4293#ifdef CONFIG_RPS
4294 struct softnet_data *remsd = sd->rps_ipi_list;
4295
4296 if (remsd) {
4297 sd->rps_ipi_list = NULL;
4298
4299 local_irq_enable();
4300
4301 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4302 while (remsd) {
4303 struct softnet_data *next = remsd->rps_ipi_next;
4304
4305 if (cpu_online(remsd->cpu))
c46fff2a 4306 smp_call_function_single_async(remsd->cpu,
fce8ad15 4307 &remsd->csd);
e326bed2
ED
4308 remsd = next;
4309 }
4310 } else
4311#endif
4312 local_irq_enable();
4313}
4314
d75b1ade
ED
4315static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4316{
4317#ifdef CONFIG_RPS
4318 return sd->rps_ipi_list != NULL;
4319#else
4320 return false;
4321#endif
4322}
4323
bea3348e 4324static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4325{
4326 int work = 0;
eecfd7c4 4327 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4328
e326bed2
ED
4329 /* Check if we have pending IPIs; it's better to send them now
4330 * than to wait for net_rx_action() to end.
4331 */
d75b1ade 4332 if (sd_has_rps_ipi_waiting(sd)) {
e326bed2
ED
4333 local_irq_disable();
4334 net_rps_action_and_irq_enable(sd);
4335 }
d75b1ade 4336
bea3348e 4337 napi->weight = weight_p;
6e7676c1 4338 local_irq_disable();
11ef7a89 4339 while (1) {
1da177e4 4340 struct sk_buff *skb;
6e7676c1
CG
4341
4342 while ((skb = __skb_dequeue(&sd->process_queue))) {
4343 local_irq_enable();
4344 __netif_receive_skb(skb);
6e7676c1 4345 local_irq_disable();
76cc8b13
TH
4346 input_queue_head_incr(sd);
4347 if (++work >= quota) {
4348 local_irq_enable();
4349 return work;
4350 }
6e7676c1 4351 }
1da177e4 4352
e36fa2f7 4353 rps_lock(sd);
11ef7a89 4354 if (skb_queue_empty(&sd->input_pkt_queue)) {
eecfd7c4
ED
4355 /*
4356 * Inline a custom version of __napi_complete().
4357 * Only the current CPU owns and manipulates this napi,
11ef7a89
TH
4358 * and NAPI_STATE_SCHED is the only possible flag set
4359 * on backlog.
4360 * We can use a plain write instead of clear_bit(),
eecfd7c4
ED
4361 * and we don't need an smp_mb() memory barrier.
4362 */
eecfd7c4 4363 napi->state = 0;
11ef7a89 4364 rps_unlock(sd);
eecfd7c4 4365
11ef7a89 4366 break;
bea3348e 4367 }
11ef7a89
TH
4368
4369 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4370 &sd->process_queue);
e36fa2f7 4371 rps_unlock(sd);
6e7676c1
CG
4372 }
4373 local_irq_enable();
1da177e4 4374
bea3348e
SH
4375 return work;
4376}
1da177e4 4377
bea3348e
SH
4378/**
4379 * __napi_schedule - schedule for receive
c4ea43c5 4380 * @n: entry to schedule
bea3348e 4381 *
bc9ad166
ED
4382 * The entry's receive function will be scheduled to run.
4383 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
bea3348e 4384 */
b5606c2d 4385void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4386{
4387 unsigned long flags;
1da177e4 4388
bea3348e 4389 local_irq_save(flags);
903ceff7 4390 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
bea3348e 4391 local_irq_restore(flags);
1da177e4 4392}
bea3348e
SH
4393EXPORT_SYMBOL(__napi_schedule);
4394
bc9ad166
ED
4395/**
4396 * __napi_schedule_irqoff - schedule for receive
4397 * @n: entry to schedule
4398 *
4399 * Variant of __napi_schedule() assuming hard irqs are masked
4400 */
4401void __napi_schedule_irqoff(struct napi_struct *n)
4402{
4403 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4404}
4405EXPORT_SYMBOL(__napi_schedule_irqoff);
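/*
 * Example (editor's illustrative sketch, not part of dev.c): the usual
 * interrupt-handler side of NAPI.  "struct example_irq_ctx" is
 * hypothetical; napi_schedule_prep() and __napi_schedule_irqoff() are
 * the real APIs.
 */
struct example_irq_ctx {
	struct napi_struct napi;
};

static irqreturn_t example_isr(int irq, void *dev_id)
{
	struct example_irq_ctx *ctx = dev_id;

	if (napi_schedule_prep(&ctx->napi)) {
		/* device specific: mask further RX interrupts here */
		__napi_schedule_irqoff(&ctx->napi);	/* hard irqs are masked in an ISR */
	}
	return IRQ_HANDLED;
}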
4406
d565b0a1
HX
4407void __napi_complete(struct napi_struct *n)
4408{
4409 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
d565b0a1 4410
d75b1ade 4411 list_del_init(&n->poll_list);
4e857c58 4412 smp_mb__before_atomic();
d565b0a1
HX
4413 clear_bit(NAPI_STATE_SCHED, &n->state);
4414}
4415EXPORT_SYMBOL(__napi_complete);
4416
3b47d303 4417void napi_complete_done(struct napi_struct *n, int work_done)
d565b0a1
HX
4418{
4419 unsigned long flags;
4420
4421 /*
4422 * don't let napi dequeue from the cpu poll list
4423 * just in case it's running on a different CPU
4424 */
4425 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4426 return;
4427
3b47d303
ED
4428 if (n->gro_list) {
4429 unsigned long timeout = 0;
d75b1ade 4430
3b47d303
ED
4431 if (work_done)
4432 timeout = n->dev->gro_flush_timeout;
4433
4434 if (timeout)
4435 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4436 HRTIMER_MODE_REL_PINNED);
4437 else
4438 napi_gro_flush(n, false);
4439 }
d75b1ade
ED
4440 if (likely(list_empty(&n->poll_list))) {
4441 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4442 } else {
4443 /* If n->poll_list is not empty, we need to mask irqs */
4444 local_irq_save(flags);
4445 __napi_complete(n);
4446 local_irq_restore(flags);
4447 }
d565b0a1 4448}
3b47d303 4449EXPORT_SYMBOL(napi_complete_done);
d565b0a1 4450
af12fa6e
ET
4451/* must be called under rcu_read_lock(), as we dont take a reference */
4452struct napi_struct *napi_by_id(unsigned int napi_id)
4453{
4454 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4455 struct napi_struct *napi;
4456
4457 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4458 if (napi->napi_id == napi_id)
4459 return napi;
4460
4461 return NULL;
4462}
4463EXPORT_SYMBOL_GPL(napi_by_id);
4464
4465void napi_hash_add(struct napi_struct *napi)
4466{
4467 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4468
4469 spin_lock(&napi_hash_lock);
4470
4471 /* 0 is not a valid id; we also skip an id that is already taken.
4472 * We expect both events to be extremely rare.
4473 */
4474 napi->napi_id = 0;
4475 while (!napi->napi_id) {
4476 napi->napi_id = ++napi_gen_id;
4477 if (napi_by_id(napi->napi_id))
4478 napi->napi_id = 0;
4479 }
4480
4481 hlist_add_head_rcu(&napi->napi_hash_node,
4482 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4483
4484 spin_unlock(&napi_hash_lock);
4485 }
4486}
4487EXPORT_SYMBOL_GPL(napi_hash_add);
4488
4489/* Warning: the caller is responsible for making sure an RCU grace period
4490 * has elapsed before freeing the memory containing @napi.
4491 */
4492void napi_hash_del(struct napi_struct *napi)
4493{
4494 spin_lock(&napi_hash_lock);
4495
4496 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4497 hlist_del_rcu(&napi->napi_hash_node);
4498
4499 spin_unlock(&napi_hash_lock);
4500}
4501EXPORT_SYMBOL_GPL(napi_hash_del);
4502
3b47d303
ED
4503static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4504{
4505 struct napi_struct *napi;
4506
4507 napi = container_of(timer, struct napi_struct, timer);
4508 if (napi->gro_list)
4509 napi_schedule(napi);
4510
4511 return HRTIMER_NORESTART;
4512}
4513
d565b0a1
HX
4514void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4515 int (*poll)(struct napi_struct *, int), int weight)
4516{
4517 INIT_LIST_HEAD(&napi->poll_list);
3b47d303
ED
4518 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4519 napi->timer.function = napi_watchdog;
4ae5544f 4520 napi->gro_count = 0;
d565b0a1 4521 napi->gro_list = NULL;
5d38a079 4522 napi->skb = NULL;
d565b0a1 4523 napi->poll = poll;
82dc3c63
ED
4524 if (weight > NAPI_POLL_WEIGHT)
4525 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4526 weight, dev->name);
d565b0a1
HX
4527 napi->weight = weight;
4528 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4529 napi->dev = dev;
5d38a079 4530#ifdef CONFIG_NETPOLL
d565b0a1
HX
4531 spin_lock_init(&napi->poll_lock);
4532 napi->poll_owner = -1;
4533#endif
4534 set_bit(NAPI_STATE_SCHED, &napi->state);
4535}
4536EXPORT_SYMBOL(netif_napi_add);
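/*
 * Example (editor's illustrative sketch, not part of dev.c): wiring a
 * driver into the NAPI API registered above.  example_clean_rx() is a
 * hypothetical routine that processes up to @budget packets and returns
 * how many it handled; everything else uses the real API.
 */
struct example_napi_priv {
	struct napi_struct napi;
	struct net_device *netdev;
};

static int example_clean_rx(struct example_napi_priv *priv, int budget); /* hypothetical */

static int example_napi_poll(struct napi_struct *napi, int budget)
{
	struct example_napi_priv *priv =
		container_of(napi, struct example_napi_priv, napi);
	int work_done = example_clean_rx(priv, budget);

	if (work_done < budget) {
		napi_complete_done(napi, work_done);
		/* device specific: re-enable RX interrupts here */
	}
	return work_done;	/* must never exceed budget */
}

static void example_setup_napi(struct example_napi_priv *priv)
{
	netif_napi_add(priv->netdev, &priv->napi, example_napi_poll,
		       NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);
}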
4537
3b47d303
ED
4538void napi_disable(struct napi_struct *n)
4539{
4540 might_sleep();
4541 set_bit(NAPI_STATE_DISABLE, &n->state);
4542
4543 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4544 msleep(1);
4545
4546 hrtimer_cancel(&n->timer);
4547
4548 clear_bit(NAPI_STATE_DISABLE, &n->state);
4549}
4550EXPORT_SYMBOL(napi_disable);
4551
d565b0a1
HX
4552void netif_napi_del(struct napi_struct *napi)
4553{
d7b06636 4554 list_del_init(&napi->dev_list);
76620aaf 4555 napi_free_frags(napi);
d565b0a1 4556
289dccbe 4557 kfree_skb_list(napi->gro_list);
d565b0a1 4558 napi->gro_list = NULL;
4ae5544f 4559 napi->gro_count = 0;
d565b0a1
HX
4560}
4561EXPORT_SYMBOL(netif_napi_del);
4562
726ce70e
HX
4563static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4564{
4565 void *have;
4566 int work, weight;
4567
4568 list_del_init(&n->poll_list);
4569
4570 have = netpoll_poll_lock(n);
4571
4572 weight = n->weight;
4573
4574 /* This NAPI_STATE_SCHED test is for avoiding a race
4575 * with netpoll's poll_napi(). Only the entity which
4576 * obtains the lock and sees NAPI_STATE_SCHED set will
4577 * actually make the ->poll() call. Therefore we avoid
4578 * accidentally calling ->poll() when NAPI is not scheduled.
4579 */
4580 work = 0;
4581 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4582 work = n->poll(n, weight);
4583 trace_napi_poll(n);
4584 }
4585
4586 WARN_ON_ONCE(work > weight);
4587
4588 if (likely(work < weight))
4589 goto out_unlock;
4590
4591 /* Drivers must not modify the NAPI state if they
4592 * consume the entire weight. In such cases this code
4593 * still "owns" the NAPI instance and therefore can
4594 * move the instance around on the list at-will.
4595 */
4596 if (unlikely(napi_disable_pending(n))) {
4597 napi_complete(n);
4598 goto out_unlock;
4599 }
4600
4601 if (n->gro_list) {
4602 /* Flush packets that are too old.
4603 * If HZ < 1000, flush all packets.
4604 */
4605 napi_gro_flush(n, HZ >= 1000);
4606 }
4607
001ce546
HX
4608 /* Some drivers may have called napi_schedule
4609 * prior to exhausting their budget.
4610 */
4611 if (unlikely(!list_empty(&n->poll_list))) {
4612 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4613 n->dev ? n->dev->name : "backlog");
4614 goto out_unlock;
4615 }
4616
726ce70e
HX
4617 list_add_tail(&n->poll_list, repoll);
4618
4619out_unlock:
4620 netpoll_poll_unlock(have);
4621
4622 return work;
4623}
4624
1da177e4
LT
4625static void net_rx_action(struct softirq_action *h)
4626{
903ceff7 4627 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
24f8b238 4628 unsigned long time_limit = jiffies + 2;
51b0bded 4629 int budget = netdev_budget;
d75b1ade
ED
4630 LIST_HEAD(list);
4631 LIST_HEAD(repoll);
53fb95d3 4632
1da177e4 4633 local_irq_disable();
d75b1ade
ED
4634 list_splice_init(&sd->poll_list, &list);
4635 local_irq_enable();
1da177e4 4636
ceb8d5bf 4637 for (;;) {
bea3348e 4638 struct napi_struct *n;
1da177e4 4639
ceb8d5bf
HX
4640 if (list_empty(&list)) {
4641 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4642 return;
4643 break;
4644 }
4645
6bd373eb
HX
4646 n = list_first_entry(&list, struct napi_struct, poll_list);
4647 budget -= napi_poll(n, &repoll);
4648
d75b1ade 4649 /* If softirq window is exhausted then punt.
24f8b238
SH
4650 * Allow this to run for 2 jiffies, which will allow
4651 * an average latency of 1.5/HZ.
bea3348e 4652 */
ceb8d5bf
HX
4653 if (unlikely(budget <= 0 ||
4654 time_after_eq(jiffies, time_limit))) {
4655 sd->time_squeeze++;
4656 break;
4657 }
1da177e4 4658 }
d75b1ade 4659
d75b1ade
ED
4660 local_irq_disable();
4661
4662 list_splice_tail_init(&sd->poll_list, &list);
4663 list_splice_tail(&repoll, &list);
4664 list_splice(&list, &sd->poll_list);
4665 if (!list_empty(&sd->poll_list))
4666 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4667
e326bed2 4668 net_rps_action_and_irq_enable(sd);
1da177e4
LT
4669}
4670
aa9d8560 4671struct netdev_adjacent {
9ff162a8 4672 struct net_device *dev;
5d261913
VF
4673
4674 /* upper master flag, there can only be one master device per list */
9ff162a8 4675 bool master;
5d261913 4676
5d261913
VF
4677 /* counter for the number of times this device was added to us */
4678 u16 ref_nr;
4679
402dae96
VF
4680 /* private field for the users */
4681 void *private;
4682
9ff162a8
JP
4683 struct list_head list;
4684 struct rcu_head rcu;
9ff162a8
JP
4685};
4686
5d261913
VF
4687static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4688 struct net_device *adj_dev,
2f268f12 4689 struct list_head *adj_list)
9ff162a8 4690{
5d261913 4691 struct netdev_adjacent *adj;
5d261913 4692
2f268f12 4693 list_for_each_entry(adj, adj_list, list) {
5d261913
VF
4694 if (adj->dev == adj_dev)
4695 return adj;
9ff162a8
JP
4696 }
4697 return NULL;
4698}
4699
4700/**
4701 * netdev_has_upper_dev - Check if device is linked to an upper device
4702 * @dev: device
4703 * @upper_dev: upper device to check
4704 *
4705 * Find out if a device is linked to the specified upper device and return true
4706 * if it is. Note that this checks only the immediate upper device,
4707 * not the complete stack of devices. The caller must hold the RTNL lock.
4708 */
4709bool netdev_has_upper_dev(struct net_device *dev,
4710 struct net_device *upper_dev)
4711{
4712 ASSERT_RTNL();
4713
2f268f12 4714 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
9ff162a8
JP
4715}
4716EXPORT_SYMBOL(netdev_has_upper_dev);
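/*
 * Example (editor's illustrative sketch, not part of dev.c): the check
 * above only looks at the immediate upper device and must run under
 * RTNL.
 */
static bool example_is_direct_upper(struct net_device *dev,
				    struct net_device *upper)
{
	bool linked;

	rtnl_lock();
	linked = netdev_has_upper_dev(dev, upper);
	rtnl_unlock();

	return linked;
}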
4717
4718/**
4719 * netdev_has_any_upper_dev - Check if device is linked to some device
4720 * @dev: device
4721 *
4722 * Find out if a device is linked to an upper device and return true in case
4723 * it is. The caller must hold the RTNL lock.
4724 */
1d143d9f 4725static bool netdev_has_any_upper_dev(struct net_device *dev)
9ff162a8
JP
4726{
4727 ASSERT_RTNL();
4728
2f268f12 4729 return !list_empty(&dev->all_adj_list.upper);
9ff162a8 4730}
9ff162a8
JP
4731
4732/**
4733 * netdev_master_upper_dev_get - Get master upper device
4734 * @dev: device
4735 *
4736 * Find a master upper device and return pointer to it or NULL in case
4737 * it's not there. The caller must hold the RTNL lock.
4738 */
4739struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4740{
aa9d8560 4741 struct netdev_adjacent *upper;
9ff162a8
JP
4742
4743 ASSERT_RTNL();
4744
2f268f12 4745 if (list_empty(&dev->adj_list.upper))
9ff162a8
JP
4746 return NULL;
4747
2f268f12 4748 upper = list_first_entry(&dev->adj_list.upper,
aa9d8560 4749 struct netdev_adjacent, list);
9ff162a8
JP
4750 if (likely(upper->master))
4751 return upper->dev;
4752 return NULL;
4753}
4754EXPORT_SYMBOL(netdev_master_upper_dev_get);
4755
b6ccba4c
VF
4756void *netdev_adjacent_get_private(struct list_head *adj_list)
4757{
4758 struct netdev_adjacent *adj;
4759
4760 adj = list_entry(adj_list, struct netdev_adjacent, list);
4761
4762 return adj->private;
4763}
4764EXPORT_SYMBOL(netdev_adjacent_get_private);
4765
44a40855
VY
4766/**
4767 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4768 * @dev: device
4769 * @iter: list_head ** of the current position
4770 *
4771 * Gets the next device from the dev's upper list, starting from iter
4772 * position. The caller must hold RCU read lock.
4773 */
4774struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4775 struct list_head **iter)
4776{
4777 struct netdev_adjacent *upper;
4778
4779 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4780
4781 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4782
4783 if (&upper->list == &dev->adj_list.upper)
4784 return NULL;
4785
4786 *iter = &upper->list;
4787
4788 return upper->dev;
4789}
4790EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
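/*
 * Example (editor's illustrative sketch, not part of dev.c): walking the
 * immediate upper devices with the iterator above.  Starting the
 * iterator at &dev->adj_list.upper matches how the list is scanned in
 * this file; netdevice.h also offers for-each macros built on these
 * iterators.
 */
static void example_walk_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		pr_debug("%s is an upper device of %s\n",
			 upper->name, dev->name);
	rcu_read_unlock();
}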
4791
31088a11
VF
4792/**
4793 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
48311f46
VF
4794 * @dev: device
4795 * @iter: list_head ** of the current position
4796 *
4797 * Gets the next device from the dev's upper list, starting from iter
4798 * position. The caller must hold RCU read lock.
4799 */
2f268f12
VF
4800struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4801 struct list_head **iter)
48311f46
VF
4802{
4803 struct netdev_adjacent *upper;
4804
85328240 4805 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
48311f46
VF
4806
4807 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4808
2f268f12 4809 if (&upper->list == &dev->all_adj_list.upper)
48311f46
VF
4810 return NULL;
4811
4812 *iter = &upper->list;
4813
4814 return upper->dev;
4815}
2f268f12 4816EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
48311f46 4817
31088a11
VF
4818/**
4819 * netdev_lower_get_next_private - Get the next ->private from the
4820 * lower neighbour list
4821 * @dev: device
4822 * @iter: list_head ** of the current position
4823 *
4824 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4825 * list, starting from iter position. The caller must either hold the
4826 * RTNL lock or its own locking that guarantees that the neighbour lower
4827 * list will remain unchanged.
4828 */
4829void *netdev_lower_get_next_private(struct net_device *dev,
4830 struct list_head **iter)
4831{
4832 struct netdev_adjacent *lower;
4833
4834 lower = list_entry(*iter, struct netdev_adjacent, list);
4835
4836 if (&lower->list == &dev->adj_list.lower)
4837 return NULL;
4838
6859e7df 4839 *iter = lower->list.next;
31088a11
VF
4840
4841 return lower->private;
4842}
4843EXPORT_SYMBOL(netdev_lower_get_next_private);
4844
4845/**
4846 * netdev_lower_get_next_private_rcu - Get the next ->private from the
4847 * lower neighbour list, RCU
4848 * variant
4849 * @dev: device
4850 * @iter: list_head ** of the current position
4851 *
4852 * Gets the next netdev_adjacent->private from the dev's lower neighbour
4853 * list, starting from iter position. The caller must hold RCU read lock.
4854 */
4855void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4856 struct list_head **iter)
4857{
4858 struct netdev_adjacent *lower;
4859
4860 WARN_ON_ONCE(!rcu_read_lock_held());
4861
4862 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4863
4864 if (&lower->list == &dev->adj_list.lower)
4865 return NULL;
4866
6859e7df 4867 *iter = &lower->list;
31088a11
VF
4868
4869 return lower->private;
4870}
4871EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4872
4085ebe8
VY
4873/**
4874 * netdev_lower_get_next - Get the next device from the lower neighbour
4875 * list
4876 * @dev: device
4877 * @iter: list_head ** of the current position
4878 *
4879 * Gets the next netdev_adjacent from the dev's lower neighbour
4880 * list, starting from iter position. The caller must hold RTNL lock or
4881 * its own locking that guarantees that the neighbour lower
4882 * list will remain unchainged.
4883 */
4884void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4885{
4886 struct netdev_adjacent *lower;
4887
4888 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4889
4890 if (&lower->list == &dev->adj_list.lower)
4891 return NULL;
4892
4893 *iter = &lower->list;
4894
4895 return lower->dev;
4896}
4897EXPORT_SYMBOL(netdev_lower_get_next);
4898
e001bfad 4899/**
4900 * netdev_lower_get_first_private_rcu - Get the first ->private from the
4901 * lower neighbour list, RCU
4902 * variant
4903 * @dev: device
4904 *
4905 * Gets the first netdev_adjacent->private from the dev's lower neighbour
4906 * list. The caller must hold RCU read lock.
4907 */
4908void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4909{
4910 struct netdev_adjacent *lower;
4911
4912 lower = list_first_or_null_rcu(&dev->adj_list.lower,
4913 struct netdev_adjacent, list);
4914 if (lower)
4915 return lower->private;
4916 return NULL;
4917}
4918EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4919
9ff162a8
JP
4920/**
4921 * netdev_master_upper_dev_get_rcu - Get master upper device
4922 * @dev: device
4923 *
4924 * Find a master upper device and return pointer to it or NULL in case
4925 * it's not there. The caller must hold the RCU read lock.
4926 */
4927struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4928{
aa9d8560 4929 struct netdev_adjacent *upper;
9ff162a8 4930
2f268f12 4931 upper = list_first_or_null_rcu(&dev->adj_list.upper,
aa9d8560 4932 struct netdev_adjacent, list);
9ff162a8
JP
4933 if (upper && likely(upper->master))
4934 return upper->dev;
4935 return NULL;
4936}
4937EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4938
0a59f3a9 4939static int netdev_adjacent_sysfs_add(struct net_device *dev,
3ee32707
VF
4940 struct net_device *adj_dev,
4941 struct list_head *dev_list)
4942{
4943 char linkname[IFNAMSIZ+7];
4944 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4945 "upper_%s" : "lower_%s", adj_dev->name);
4946 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4947 linkname);
4948}
0a59f3a9 4949static void netdev_adjacent_sysfs_del(struct net_device *dev,
3ee32707
VF
4950 char *name,
4951 struct list_head *dev_list)
4952{
4953 char linkname[IFNAMSIZ+7];
4954 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4955 "upper_%s" : "lower_%s", name);
4956 sysfs_remove_link(&(dev->dev.kobj), linkname);
4957}
4958
7ce64c79
AF
4959static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4960 struct net_device *adj_dev,
4961 struct list_head *dev_list)
4962{
4963 return (dev_list == &dev->adj_list.upper ||
4964 dev_list == &dev->adj_list.lower) &&
4965 net_eq(dev_net(dev), dev_net(adj_dev));
4966}
3ee32707 4967
5d261913
VF
4968static int __netdev_adjacent_dev_insert(struct net_device *dev,
4969 struct net_device *adj_dev,
7863c054 4970 struct list_head *dev_list,
402dae96 4971 void *private, bool master)
5d261913
VF
4972{
4973 struct netdev_adjacent *adj;
842d67a7 4974 int ret;
5d261913 4975
7863c054 4976 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913
VF
4977
4978 if (adj) {
5d261913
VF
4979 adj->ref_nr++;
4980 return 0;
4981 }
4982
4983 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4984 if (!adj)
4985 return -ENOMEM;
4986
4987 adj->dev = adj_dev;
4988 adj->master = master;
5d261913 4989 adj->ref_nr = 1;
402dae96 4990 adj->private = private;
5d261913 4991 dev_hold(adj_dev);
2f268f12
VF
4992
4993 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4994 adj_dev->name, dev->name, adj_dev->name);
5d261913 4995
7ce64c79 4996 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
3ee32707 4997 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5831d66e
VF
4998 if (ret)
4999 goto free_adj;
5000 }
5001
7863c054 5002 /* Ensure that master link is always the first item in list. */
842d67a7
VF
5003 if (master) {
5004 ret = sysfs_create_link(&(dev->dev.kobj),
5005 &(adj_dev->dev.kobj), "master");
5006 if (ret)
5831d66e 5007 goto remove_symlinks;
842d67a7 5008
7863c054 5009 list_add_rcu(&adj->list, dev_list);
842d67a7 5010 } else {
7863c054 5011 list_add_tail_rcu(&adj->list, dev_list);
842d67a7 5012 }
5d261913
VF
5013
5014 return 0;
842d67a7 5015
5831d66e 5016remove_symlinks:
7ce64c79 5017 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5018 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
842d67a7
VF
5019free_adj:
5020 kfree(adj);
974daef7 5021 dev_put(adj_dev);
842d67a7
VF
5022
5023 return ret;
5d261913
VF
5024}
5025
1d143d9f 5026static void __netdev_adjacent_dev_remove(struct net_device *dev,
5027 struct net_device *adj_dev,
5028 struct list_head *dev_list)
5d261913
VF
5029{
5030 struct netdev_adjacent *adj;
5031
7863c054 5032 adj = __netdev_find_adj(dev, adj_dev, dev_list);
5d261913 5033
2f268f12
VF
5034 if (!adj) {
5035 pr_err("tried to remove device %s from %s\n",
5036 dev->name, adj_dev->name);
5d261913 5037 BUG();
2f268f12 5038 }
5d261913
VF
5039
5040 if (adj->ref_nr > 1) {
2f268f12
VF
5041 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5042 adj->ref_nr-1);
5d261913
VF
5043 adj->ref_nr--;
5044 return;
5045 }
5046
842d67a7
VF
5047 if (adj->master)
5048 sysfs_remove_link(&(dev->dev.kobj), "master");
5049
7ce64c79 5050 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
3ee32707 5051 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5831d66e 5052
5d261913 5053 list_del_rcu(&adj->list);
2f268f12
VF
5054 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5055 adj_dev->name, dev->name, adj_dev->name);
5d261913
VF
5056 dev_put(adj_dev);
5057 kfree_rcu(adj, rcu);
5058}
5059
1d143d9f 5060static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5061 struct net_device *upper_dev,
5062 struct list_head *up_list,
5063 struct list_head *down_list,
5064 void *private, bool master)
5d261913
VF
5065{
5066 int ret;
5067
402dae96
VF
5068 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5069 master);
5d261913
VF
5070 if (ret)
5071 return ret;
5072
402dae96
VF
5073 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5074 false);
5d261913 5075 if (ret) {
2f268f12 5076 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5d261913
VF
5077 return ret;
5078 }
5079
5080 return 0;
5081}
5082
1d143d9f 5083static int __netdev_adjacent_dev_link(struct net_device *dev,
5084 struct net_device *upper_dev)
5d261913 5085{
2f268f12
VF
5086 return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5087 &dev->all_adj_list.upper,
5088 &upper_dev->all_adj_list.lower,
402dae96 5089 NULL, false);
5d261913
VF
5090}
5091
1d143d9f 5092static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5093 struct net_device *upper_dev,
5094 struct list_head *up_list,
5095 struct list_head *down_list)
5d261913 5096{
2f268f12
VF
5097 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5098 __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5d261913
VF
5099}
5100
1d143d9f 5101static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5102 struct net_device *upper_dev)
5d261913 5103{
2f268f12
VF
5104 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5105 &dev->all_adj_list.upper,
5106 &upper_dev->all_adj_list.lower);
5107}
5108
1d143d9f 5109static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5110 struct net_device *upper_dev,
5111 void *private, bool master)
2f268f12
VF
5112{
5113 int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5114
5115 if (ret)
5116 return ret;
5117
5118 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5119 &dev->adj_list.upper,
5120 &upper_dev->adj_list.lower,
402dae96 5121 private, master);
2f268f12
VF
5122 if (ret) {
5123 __netdev_adjacent_dev_unlink(dev, upper_dev);
5124 return ret;
5125 }
5126
5127 return 0;
5d261913
VF
5128}
5129
1d143d9f 5130static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5131 struct net_device *upper_dev)
2f268f12
VF
5132{
5133 __netdev_adjacent_dev_unlink(dev, upper_dev);
5134 __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5135 &dev->adj_list.upper,
5136 &upper_dev->adj_list.lower);
5137}
5d261913 5138
9ff162a8 5139static int __netdev_upper_dev_link(struct net_device *dev,
402dae96
VF
5140 struct net_device *upper_dev, bool master,
5141 void *private)
9ff162a8 5142{
5d261913
VF
5143 struct netdev_adjacent *i, *j, *to_i, *to_j;
5144 int ret = 0;
9ff162a8
JP
5145
5146 ASSERT_RTNL();
5147
5148 if (dev == upper_dev)
5149 return -EBUSY;
5150
5151 /* To prevent loops, check that dev is not an upper device of upper_dev. */
2f268f12 5152 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
9ff162a8
JP
5153 return -EBUSY;
5154
2f268f12 5155 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
9ff162a8
JP
5156 return -EEXIST;
5157
5158 if (master && netdev_master_upper_dev_get(dev))
5159 return -EBUSY;
5160
402dae96
VF
5161 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5162 master);
5d261913
VF
5163 if (ret)
5164 return ret;
9ff162a8 5165
5d261913 5166 /* Now that we linked these devs, make all the upper_dev's
2f268f12 5167 * all_adj_list.upper visible to every dev's all_adj_list.lower and vice
5d261913
VF
5168 * versa, and don't forget the devices themselves. All of these
5169 * links are non-neighbours.
5170 */
2f268f12
VF
5171 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5172 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5173 pr_debug("Interlinking %s with %s, non-neighbour\n",
5174 i->dev->name, j->dev->name);
5d261913
VF
5175 ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5176 if (ret)
5177 goto rollback_mesh;
5178 }
5179 }
5180
5181 /* add dev to every upper_dev's upper device */
2f268f12
VF
5182 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5183 pr_debug("linking %s's upper device %s with %s\n",
5184 upper_dev->name, i->dev->name, dev->name);
5d261913
VF
5185 ret = __netdev_adjacent_dev_link(dev, i->dev);
5186 if (ret)
5187 goto rollback_upper_mesh;
5188 }
5189
5190 /* add upper_dev to every dev's lower device */
2f268f12
VF
5191 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5192 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5193 i->dev->name, upper_dev->name);
5d261913
VF
5194 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5195 if (ret)
5196 goto rollback_lower_mesh;
5197 }
9ff162a8 5198
42e52bf9 5199 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8 5200 return 0;
5d261913
VF
5201
5202rollback_lower_mesh:
5203 to_i = i;
2f268f12 5204 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5d261913
VF
5205 if (i == to_i)
5206 break;
5207 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5208 }
5209
5210 i = NULL;
5211
5212rollback_upper_mesh:
5213 to_i = i;
2f268f12 5214 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5215 if (i == to_i)
5216 break;
5217 __netdev_adjacent_dev_unlink(dev, i->dev);
5218 }
5219
5220 i = j = NULL;
5221
5222rollback_mesh:
5223 to_i = i;
5224 to_j = j;
2f268f12
VF
5225 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5226 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5d261913
VF
5227 if (i == to_i && j == to_j)
5228 break;
5229 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5230 }
5231 if (i == to_i)
5232 break;
5233 }
5234
2f268f12 5235 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5236
5237 return ret;
9ff162a8
JP
5238}
5239
5240/**
5241 * netdev_upper_dev_link - Add a link to the upper device
5242 * @dev: device
5243 * @upper_dev: new upper device
5244 *
5245 * Adds a link to device which is upper to this one. The caller must hold
5246 * the RTNL lock. On a failure a negative errno code is returned.
5247 * On success the reference counts are adjusted and the function
5248 * returns zero.
5249 */
5250int netdev_upper_dev_link(struct net_device *dev,
5251 struct net_device *upper_dev)
5252{
402dae96 5253 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
9ff162a8
JP
5254}
5255EXPORT_SYMBOL(netdev_upper_dev_link);
5256
5257/**
5258 * netdev_master_upper_dev_link - Add a master link to the upper device
5259 * @dev: device
5260 * @upper_dev: new upper device
5261 *
5262 * Adds a link to device which is upper to this one. In this case, only
5263 * one master upper device can be linked, although other non-master devices
5264 * might be linked as well. The caller must hold the RTNL lock.
5265 * On a failure a negative errno code is returned. On success the reference
5266 * counts are adjusted and the function returns zero.
5267 */
5268int netdev_master_upper_dev_link(struct net_device *dev,
5269 struct net_device *upper_dev)
5270{
402dae96 5271 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
9ff162a8
JP
5272}
5273EXPORT_SYMBOL(netdev_master_upper_dev_link);
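/*
 * Example (editor's illustrative sketch, not part of dev.c): a
 * bonding/team-style master taking a port, using the helper above.
 * The caller is assumed to already hold RTNL, as the API requires.
 */
static int example_take_port(struct net_device *master,
			     struct net_device *port)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(port, master);
	if (err)
		return err;

	/* the reverse operation is netdev_upper_dev_unlink(port, master) */
	return 0;
}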
5274
402dae96
VF
5275int netdev_master_upper_dev_link_private(struct net_device *dev,
5276 struct net_device *upper_dev,
5277 void *private)
5278{
5279 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5280}
5281EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5282
9ff162a8
JP
5283/**
5284 * netdev_upper_dev_unlink - Removes a link to upper device
5285 * @dev: device
5286 * @upper_dev: upper device to remove the link to
5287 *
5288 * Removes a link to device which is upper to this one. The caller must hold
5289 * the RTNL lock.
5290 */
5291void netdev_upper_dev_unlink(struct net_device *dev,
5292 struct net_device *upper_dev)
5293{
5d261913 5294 struct netdev_adjacent *i, *j;
9ff162a8
JP
5295 ASSERT_RTNL();
5296
2f268f12 5297 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5d261913
VF
5298
5299 /* Here is the tricky part. We must remove all dev's lower
5300 * devices from all upper_dev's upper devices and vice
5301 * versa, to maintain the graph relationship.
5302 */
2f268f12
VF
5303 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5304 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5305 __netdev_adjacent_dev_unlink(i->dev, j->dev);
5306
5307 /* Also remove the device itself from the lower/upper
5308 * device lists.
5309 */
2f268f12 5310 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5d261913
VF
5311 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5312
2f268f12 5313 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5d261913
VF
5314 __netdev_adjacent_dev_unlink(dev, i->dev);
5315
42e52bf9 5316 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
9ff162a8
JP
5317}
5318EXPORT_SYMBOL(netdev_upper_dev_unlink);
5319
4c75431a
AF
5320void netdev_adjacent_add_links(struct net_device *dev)
5321{
5322 struct netdev_adjacent *iter;
5323
5324 struct net *net = dev_net(dev);
5325
5326 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5327 if (!net_eq(net, dev_net(iter->dev)))
5328 continue;
5329 netdev_adjacent_sysfs_add(iter->dev, dev,
5330 &iter->dev->adj_list.lower);
5331 netdev_adjacent_sysfs_add(dev, iter->dev,
5332 &dev->adj_list.upper);
5333 }
5334
5335 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5336 if (!net_eq(net, dev_net(iter->dev)))
5337 continue;
5338 netdev_adjacent_sysfs_add(iter->dev, dev,
5339 &iter->dev->adj_list.upper);
5340 netdev_adjacent_sysfs_add(dev, iter->dev,
5341 &dev->adj_list.lower);
5342 }
5343}
5344
5345void netdev_adjacent_del_links(struct net_device *dev)
5346{
5347 struct netdev_adjacent *iter;
5348
5349 struct net *net = dev_net(dev);
5350
5351 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5352 if (!net_eq(net, dev_net(iter->dev)))
5353 continue;
5354 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5355 &iter->dev->adj_list.lower);
5356 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5357 &dev->adj_list.upper);
5358 }
5359
5360 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5361 if (!net_eq(net, dev_net(iter->dev)))
5362 continue;
5363 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5364 &iter->dev->adj_list.upper);
5365 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5366 &dev->adj_list.lower);
5367 }
5368}
5369
5bb025fa 5370void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
402dae96 5371{
5bb025fa 5372 struct netdev_adjacent *iter;
402dae96 5373
4c75431a
AF
5374 struct net *net = dev_net(dev);
5375
5bb025fa 5376 list_for_each_entry(iter, &dev->adj_list.upper, list) {
4c75431a
AF
5377 if (!net_eq(net, dev_net(iter->dev)))
5378 continue;
5bb025fa
VF
5379 netdev_adjacent_sysfs_del(iter->dev, oldname,
5380 &iter->dev->adj_list.lower);
5381 netdev_adjacent_sysfs_add(iter->dev, dev,
5382 &iter->dev->adj_list.lower);
5383 }
402dae96 5384
5bb025fa 5385 list_for_each_entry(iter, &dev->adj_list.lower, list) {
4c75431a
AF
5386 if (!net_eq(net, dev_net(iter->dev)))
5387 continue;
5bb025fa
VF
5388 netdev_adjacent_sysfs_del(iter->dev, oldname,
5389 &iter->dev->adj_list.upper);
5390 netdev_adjacent_sysfs_add(iter->dev, dev,
5391 &iter->dev->adj_list.upper);
5392 }
402dae96 5393}
402dae96
VF
5394
5395void *netdev_lower_dev_get_private(struct net_device *dev,
5396 struct net_device *lower_dev)
5397{
5398 struct netdev_adjacent *lower;
5399
5400 if (!lower_dev)
5401 return NULL;
5402 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5403 if (!lower)
5404 return NULL;
5405
5406 return lower->private;
5407}
5408EXPORT_SYMBOL(netdev_lower_dev_get_private);
5409
4085ebe8
VY
5410
5411int dev_get_nest_level(struct net_device *dev,
5412 bool (*type_check)(struct net_device *dev))
5413{
5414 struct net_device *lower = NULL;
5415 struct list_head *iter;
5416 int max_nest = -1;
5417 int nest;
5418
5419 ASSERT_RTNL();
5420
5421 netdev_for_each_lower_dev(dev, lower, iter) {
5422 nest = dev_get_nest_level(lower, type_check);
5423 if (max_nest < nest)
5424 max_nest = nest;
5425 }
5426
5427 if (type_check(dev))
5428 max_nest++;
5429
5430 return max_nest;
5431}
5432EXPORT_SYMBOL(dev_get_nest_level);
5433
b6c40d68
PM
5434static void dev_change_rx_flags(struct net_device *dev, int flags)
5435{
d314774c
SH
5436 const struct net_device_ops *ops = dev->netdev_ops;
5437
d2615bf4 5438 if (ops->ndo_change_rx_flags)
d314774c 5439 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
5440}
5441
991fb3f7 5442static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
1da177e4 5443{
b536db93 5444 unsigned int old_flags = dev->flags;
d04a48b0
EB
5445 kuid_t uid;
5446 kgid_t gid;
1da177e4 5447
24023451
PM
5448 ASSERT_RTNL();
5449
dad9b335
WC
5450 dev->flags |= IFF_PROMISC;
5451 dev->promiscuity += inc;
5452 if (dev->promiscuity == 0) {
5453 /*
5454 * Avoid overflow.
5455 * If inc causes an overflow, leave promiscuity untouched and return an error.
5456 */
5457 if (inc < 0)
5458 dev->flags &= ~IFF_PROMISC;
5459 else {
5460 dev->promiscuity -= inc;
7b6cd1ce
JP
5461 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5462 dev->name);
dad9b335
WC
5463 return -EOVERFLOW;
5464 }
5465 }
52609c0b 5466 if (dev->flags != old_flags) {
7b6cd1ce
JP
5467 pr_info("device %s %s promiscuous mode\n",
5468 dev->name,
5469 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
5470 if (audit_enabled) {
5471 current_uid_gid(&uid, &gid);
7759db82
KHK
5472 audit_log(current->audit_context, GFP_ATOMIC,
5473 AUDIT_ANOM_PROMISCUOUS,
5474 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5475 dev->name, (dev->flags & IFF_PROMISC),
5476 (old_flags & IFF_PROMISC),
e1760bd5 5477 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
5478 from_kuid(&init_user_ns, uid),
5479 from_kgid(&init_user_ns, gid),
7759db82 5480 audit_get_sessionid(current));
8192b0c4 5481 }
24023451 5482
b6c40d68 5483 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 5484 }
991fb3f7
ND
5485 if (notify)
5486 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
dad9b335 5487 return 0;
1da177e4
LT
5488}
5489
4417da66
PM
5490/**
5491 * dev_set_promiscuity - update promiscuity count on a device
5492 * @dev: device
5493 * @inc: modifier
5494 *
5495 * Add or remove promiscuity from a device. While the count in the device
5496 * remains above zero the interface remains promiscuous. Once it hits zero
5497 * the device reverts to normal filtering operation. A negative inc
5498 * value is used to drop promiscuity on the device.
dad9b335 5499 * Return 0 if successful or a negative errno code on error.
4417da66 5500 */
dad9b335 5501int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 5502{
b536db93 5503 unsigned int old_flags = dev->flags;
dad9b335 5504 int err;
4417da66 5505
991fb3f7 5506 err = __dev_set_promiscuity(dev, inc, true);
4b5a698e 5507 if (err < 0)
dad9b335 5508 return err;
4417da66
PM
5509 if (dev->flags != old_flags)
5510 dev_set_rx_mode(dev);
dad9b335 5511 return err;
4417da66 5512}
d1b19dff 5513EXPORT_SYMBOL(dev_set_promiscuity);
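/*
 * Illustrative sketch, not part of dev.c: a packet-tap style driver taking
 * and dropping one promiscuity reference on a lower device. The helper
 * names are hypothetical; dev_set_promiscuity() is the real API. It must
 * run under rtnl_lock() and its return value must be checked, since a
 * counter overflow is reported as -EOVERFLOW.
 */
static int example_tap_enable(struct net_device *lower)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(lower, 1);	/* take one reference */
	rtnl_unlock();
	return err;
}

static void example_tap_disable(struct net_device *lower)
{
	rtnl_lock();
	dev_set_promiscuity(lower, -1);		/* drop our reference */
	rtnl_unlock();
}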
4417da66 5514
991fb3f7 5515static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
1da177e4 5516{
991fb3f7 5517 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
1da177e4 5518
24023451
PM
5519 ASSERT_RTNL();
5520
1da177e4 5521 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
5522 dev->allmulti += inc;
5523 if (dev->allmulti == 0) {
5524 /*
5525 * Avoid overflow.
5526 * If inc causes an overflow, leave allmulti untouched and return an error.
5527 */
5528 if (inc < 0)
5529 dev->flags &= ~IFF_ALLMULTI;
5530 else {
5531 dev->allmulti -= inc;
7b6cd1ce
JP
5532 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5533 dev->name);
dad9b335
WC
5534 return -EOVERFLOW;
5535 }
5536 }
24023451 5537 if (dev->flags ^ old_flags) {
b6c40d68 5538 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 5539 dev_set_rx_mode(dev);
991fb3f7
ND
5540 if (notify)
5541 __dev_notify_flags(dev, old_flags,
5542 dev->gflags ^ old_gflags);
24023451 5543 }
dad9b335 5544 return 0;
4417da66 5545}
991fb3f7
ND
5546
5547/**
5548 * dev_set_allmulti - update allmulti count on a device
5549 * @dev: device
5550 * @inc: modifier
5551 *
5552 * Add or remove reception of all multicast frames to a device. While the
5553 * count in the device remains above zero the interface remains listening
5554 * to all multicast frames. Once it hits zero the device reverts to normal
5555 * filtering operation. A negative @inc value is used to drop the counter
5556 * when releasing a resource needing all multicasts.
5557 * Return 0 if successful or a negative errno code on error.
5558 */
5559
5560int dev_set_allmulti(struct net_device *dev, int inc)
5561{
5562 return __dev_set_allmulti(dev, inc, true);
5563}
d1b19dff 5564EXPORT_SYMBOL(dev_set_allmulti);
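/*
 * Illustrative sketch, not part of dev.c: a stacked device propagating its
 * own IFF_ALLMULTI state to a lower device, the way bridge/bonding-style
 * drivers keep attached ports listening to all multicast traffic. The
 * helper name is hypothetical; dev_set_allmulti() is the real API and,
 * like dev_set_promiscuity(), is reference counted and needs rtnl_lock().
 */
static int example_sync_allmulti(struct net_device *upper,
				 struct net_device *lower, bool joining)
{
	ASSERT_RTNL();

	if (!(upper->flags & IFF_ALLMULTI))
		return 0;

	/* take a reference while the lower device is attached, drop it after */
	return dev_set_allmulti(lower, joining ? 1 : -1);
}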
4417da66
PM
5565
5566/*
5567 * Upload unicast and multicast address lists to device and
5568 * configure RX filtering. When the device doesn't support unicast
53ccaae1 5569 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
5570 * are present.
5571 */
5572void __dev_set_rx_mode(struct net_device *dev)
5573{
d314774c
SH
5574 const struct net_device_ops *ops = dev->netdev_ops;
5575
4417da66
PM
5576 /* dev_open will call this function so the list will stay sane. */
5577 if (!(dev->flags&IFF_UP))
5578 return;
5579
5580 if (!netif_device_present(dev))
40b77c94 5581 return;
4417da66 5582
01789349 5583 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
5584 /* Unicast addresses changes may only happen under the rtnl,
5585 * therefore calling __dev_set_promiscuity here is safe.
5586 */
32e7bfc4 5587 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
991fb3f7 5588 __dev_set_promiscuity(dev, 1, false);
2d348d1f 5589 dev->uc_promisc = true;
32e7bfc4 5590 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
991fb3f7 5591 __dev_set_promiscuity(dev, -1, false);
2d348d1f 5592 dev->uc_promisc = false;
4417da66 5593 }
4417da66 5594 }
01789349
JP
5595
5596 if (ops->ndo_set_rx_mode)
5597 ops->ndo_set_rx_mode(dev);
4417da66
PM
5598}
5599
5600void dev_set_rx_mode(struct net_device *dev)
5601{
b9e40857 5602 netif_addr_lock_bh(dev);
4417da66 5603 __dev_set_rx_mode(dev);
b9e40857 5604 netif_addr_unlock_bh(dev);
1da177e4
LT
5605}
5606
f0db275a
SH
5607/**
5608 * dev_get_flags - get flags reported to userspace
5609 * @dev: device
5610 *
5611 * Get the combination of flag bits exported through APIs to userspace.
5612 */
95c96174 5613unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 5614{
95c96174 5615 unsigned int flags;
1da177e4
LT
5616
5617 flags = (dev->flags & ~(IFF_PROMISC |
5618 IFF_ALLMULTI |
b00055aa
SR
5619 IFF_RUNNING |
5620 IFF_LOWER_UP |
5621 IFF_DORMANT)) |
1da177e4
LT
5622 (dev->gflags & (IFF_PROMISC |
5623 IFF_ALLMULTI));
5624
b00055aa
SR
5625 if (netif_running(dev)) {
5626 if (netif_oper_up(dev))
5627 flags |= IFF_RUNNING;
5628 if (netif_carrier_ok(dev))
5629 flags |= IFF_LOWER_UP;
5630 if (netif_dormant(dev))
5631 flags |= IFF_DORMANT;
5632 }
1da177e4
LT
5633
5634 return flags;
5635}
d1b19dff 5636EXPORT_SYMBOL(dev_get_flags);
1da177e4 5637
bd380811 5638int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 5639{
b536db93 5640 unsigned int old_flags = dev->flags;
bd380811 5641 int ret;
1da177e4 5642
24023451
PM
5643 ASSERT_RTNL();
5644
1da177e4
LT
5645 /*
5646 * Set the flags on our device.
5647 */
5648
5649 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5650 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5651 IFF_AUTOMEDIA)) |
5652 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5653 IFF_ALLMULTI));
5654
5655 /*
5656 * Load in the correct multicast list now the flags have changed.
5657 */
5658
b6c40d68
PM
5659 if ((old_flags ^ flags) & IFF_MULTICAST)
5660 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5661
4417da66 5662 dev_set_rx_mode(dev);
1da177e4
LT
5663
5664 /*
5665 * Have we downed the interface? We handle IFF_UP ourselves
5666 * according to user attempts to set it, rather than blindly
5667 * setting it.
5668 */
5669
5670 ret = 0;
d215d10f 5671 if ((old_flags ^ flags) & IFF_UP)
bd380811 5672 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4 5673
1da177e4 5674 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff 5675 int inc = (flags & IFF_PROMISC) ? 1 : -1;
991fb3f7 5676 unsigned int old_flags = dev->flags;
d1b19dff 5677
1da177e4 5678 dev->gflags ^= IFF_PROMISC;
991fb3f7
ND
5679
5680 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5681 if (dev->flags != old_flags)
5682 dev_set_rx_mode(dev);
1da177e4
LT
5683 }
5684
5685 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5686 is important. Some (broken) drivers set IFF_PROMISC when
5687 IFF_ALLMULTI is requested, without asking us and without reporting it.
5688 */
5689 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5690 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5691
1da177e4 5692 dev->gflags ^= IFF_ALLMULTI;
991fb3f7 5693 __dev_set_allmulti(dev, inc, false);
1da177e4
LT
5694 }
5695
bd380811
PM
5696 return ret;
5697}
5698
a528c219
ND
5699void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5700 unsigned int gchanges)
bd380811
PM
5701{
5702 unsigned int changes = dev->flags ^ old_flags;
5703
a528c219 5704 if (gchanges)
7f294054 5705 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
a528c219 5706
bd380811
PM
5707 if (changes & IFF_UP) {
5708 if (dev->flags & IFF_UP)
5709 call_netdevice_notifiers(NETDEV_UP, dev);
5710 else
5711 call_netdevice_notifiers(NETDEV_DOWN, dev);
5712 }
5713
5714 if (dev->flags & IFF_UP &&
be9efd36
JP
5715 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5716 struct netdev_notifier_change_info change_info;
5717
5718 change_info.flags_changed = changes;
5719 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5720 &change_info.info);
5721 }
bd380811
PM
5722}
5723
5724/**
5725 * dev_change_flags - change device settings
5726 * @dev: device
5727 * @flags: device state flags
5728 *
5729 * Change settings on device based state flags. The flags are
5730 * in the userspace exported format.
5731 */
b536db93 5732int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5733{
b536db93 5734 int ret;
991fb3f7 5735 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
bd380811
PM
5736
5737 ret = __dev_change_flags(dev, flags);
5738 if (ret < 0)
5739 return ret;
5740
991fb3f7 5741 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
a528c219 5742 __dev_notify_flags(dev, old_flags, changes);
1da177e4
LT
5743 return ret;
5744}
d1b19dff 5745EXPORT_SYMBOL(dev_change_flags);
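/*
 * Illustrative sketch, not part of dev.c: bringing an interface up or down
 * from kernel code by editing the userspace-visible flags, the same path
 * SIOCSIFFLAGS takes. The helper name is hypothetical; dev_get_flags() and
 * dev_change_flags() are the real APIs and must run under rtnl_lock().
 */
static int example_set_link_up(struct net_device *dev, bool up)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	if (up)
		flags |= IFF_UP;
	else
		flags &= ~IFF_UP;
	err = dev_change_flags(dev, flags);
	rtnl_unlock();
	return err;
}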
1da177e4 5746
2315dc91
VF
5747static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5748{
5749 const struct net_device_ops *ops = dev->netdev_ops;
5750
5751 if (ops->ndo_change_mtu)
5752 return ops->ndo_change_mtu(dev, new_mtu);
5753
5754 dev->mtu = new_mtu;
5755 return 0;
5756}
5757
f0db275a
SH
5758/**
5759 * dev_set_mtu - Change maximum transfer unit
5760 * @dev: device
5761 * @new_mtu: new transfer unit
5762 *
5763 * Change the maximum transfer size of the network device.
5764 */
1da177e4
LT
5765int dev_set_mtu(struct net_device *dev, int new_mtu)
5766{
2315dc91 5767 int err, orig_mtu;
1da177e4
LT
5768
5769 if (new_mtu == dev->mtu)
5770 return 0;
5771
5772 /* MTU must be positive. */
5773 if (new_mtu < 0)
5774 return -EINVAL;
5775
5776 if (!netif_device_present(dev))
5777 return -ENODEV;
5778
1d486bfb
VF
5779 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5780 err = notifier_to_errno(err);
5781 if (err)
5782 return err;
d314774c 5783
2315dc91
VF
5784 orig_mtu = dev->mtu;
5785 err = __dev_set_mtu(dev, new_mtu);
d314774c 5786
2315dc91
VF
5787 if (!err) {
5788 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5789 err = notifier_to_errno(err);
5790 if (err) {
5791 /* setting mtu back and notifying everyone again,
5792 * so that they have a chance to revert changes.
5793 */
5794 __dev_set_mtu(dev, orig_mtu);
5795 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5796 }
5797 }
1da177e4
LT
5798 return err;
5799}
d1b19dff 5800EXPORT_SYMBOL(dev_set_mtu);
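/*
 * Illustrative sketch, not part of dev.c: a stacked driver propagating an
 * MTU change to its lower device. The helper name is hypothetical;
 * dev_set_mtu() is the real API. It runs the NETDEV_PRECHANGEMTU and
 * NETDEV_CHANGEMTU notifiers, so the caller must hold rtnl_lock() and be
 * prepared for a negative errno (the lower MTU is then left unchanged).
 */
static int example_propagate_mtu(struct net_device *upper,
				 struct net_device *lower, int new_mtu)
{
	int err;

	ASSERT_RTNL();

	err = dev_set_mtu(lower, new_mtu);
	if (err)
		return err;

	upper->mtu = new_mtu;	/* keep the stacked device consistent */
	return 0;
}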
1da177e4 5801
cbda10fa
VD
5802/**
5803 * dev_set_group - Change group this device belongs to
5804 * @dev: device
5805 * @new_group: group this device should belong to
5806 */
5807void dev_set_group(struct net_device *dev, int new_group)
5808{
5809 dev->group = new_group;
5810}
5811EXPORT_SYMBOL(dev_set_group);
5812
f0db275a
SH
5813/**
5814 * dev_set_mac_address - Change Media Access Control Address
5815 * @dev: device
5816 * @sa: new address
5817 *
5818 * Change the hardware (MAC) address of the device
5819 */
1da177e4
LT
5820int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5821{
d314774c 5822 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
5823 int err;
5824
d314774c 5825 if (!ops->ndo_set_mac_address)
1da177e4
LT
5826 return -EOPNOTSUPP;
5827 if (sa->sa_family != dev->type)
5828 return -EINVAL;
5829 if (!netif_device_present(dev))
5830 return -ENODEV;
d314774c 5831 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
5832 if (err)
5833 return err;
fbdeca2d 5834 dev->addr_assign_type = NET_ADDR_SET;
f6521516 5835 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 5836 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 5837 return 0;
1da177e4 5838}
d1b19dff 5839EXPORT_SYMBOL(dev_set_mac_address);
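/*
 * Illustrative sketch, not part of dev.c: programming a new hardware
 * address from kernel code. The helper name is hypothetical;
 * dev_set_mac_address() is the real API. The sockaddr family must match
 * dev->type (ARPHRD_ETHER here) and the caller must hold rtnl_lock().
 * Assumes <linux/if_arp.h> and <linux/if_ether.h> for ARPHRD_ETHER/ETH_ALEN.
 */
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	if (dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN)
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, ETH_ALEN);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}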
1da177e4 5840
4bf84c35
JP
5841/**
5842 * dev_change_carrier - Change device carrier
5843 * @dev: device
691b3b7e 5844 * @new_carrier: new value
4bf84c35
JP
5845 *
5846 * Change device carrier
5847 */
5848int dev_change_carrier(struct net_device *dev, bool new_carrier)
5849{
5850 const struct net_device_ops *ops = dev->netdev_ops;
5851
5852 if (!ops->ndo_change_carrier)
5853 return -EOPNOTSUPP;
5854 if (!netif_device_present(dev))
5855 return -ENODEV;
5856 return ops->ndo_change_carrier(dev, new_carrier);
5857}
5858EXPORT_SYMBOL(dev_change_carrier);
5859
66b52b0d
JP
5860/**
5861 * dev_get_phys_port_id - Get device physical port ID
5862 * @dev: device
5863 * @ppid: port ID
5864 *
5865 * Get device physical port ID
5866 */
5867int dev_get_phys_port_id(struct net_device *dev,
02637fce 5868 struct netdev_phys_item_id *ppid)
66b52b0d
JP
5869{
5870 const struct net_device_ops *ops = dev->netdev_ops;
5871
5872 if (!ops->ndo_get_phys_port_id)
5873 return -EOPNOTSUPP;
5874 return ops->ndo_get_phys_port_id(dev, ppid);
5875}
5876EXPORT_SYMBOL(dev_get_phys_port_id);
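/*
 * Illustrative sketch, not part of dev.c: deciding whether two netdevs sit
 * on the same physical port by comparing their physical port IDs. The
 * helper name is hypothetical; dev_get_phys_port_id() and
 * struct netdev_phys_item_id are the real interfaces. Drivers without
 * ndo_get_phys_port_id report -EOPNOTSUPP, treated here as "unknown".
 */
static bool example_same_phys_port(struct net_device *a, struct net_device *b)
{
	struct netdev_phys_item_id ida, idb;

	if (dev_get_phys_port_id(a, &ida) || dev_get_phys_port_id(b, &idb))
		return false;

	return ida.id_len == idb.id_len &&
	       !memcmp(ida.id, idb.id, ida.id_len);
}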
5877
1da177e4
LT
5878/**
5879 * dev_new_index - allocate an ifindex
c4ea43c5 5880 * @net: the applicable net namespace
1da177e4
LT
5881 *
5882 * Returns a suitable unique value for a new device interface
5883 * number. The caller must hold the rtnl semaphore or the
5884 * dev_base_lock to be sure it remains unique.
5885 */
881d966b 5886static int dev_new_index(struct net *net)
1da177e4 5887{
aa79e66e 5888 int ifindex = net->ifindex;
1da177e4
LT
5889 for (;;) {
5890 if (++ifindex <= 0)
5891 ifindex = 1;
881d966b 5892 if (!__dev_get_by_index(net, ifindex))
aa79e66e 5893 return net->ifindex = ifindex;
1da177e4
LT
5894 }
5895}
5896
1da177e4 5897/* Delayed registration/unregisteration */
3b5b34fd 5898static LIST_HEAD(net_todo_list);
200b916f 5899DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
1da177e4 5900
6f05f629 5901static void net_set_todo(struct net_device *dev)
1da177e4 5902{
1da177e4 5903 list_add_tail(&dev->todo_list, &net_todo_list);
50624c93 5904 dev_net(dev)->dev_unreg_count++;
1da177e4
LT
5905}
5906
9b5e383c 5907static void rollback_registered_many(struct list_head *head)
93ee31f1 5908{
e93737b0 5909 struct net_device *dev, *tmp;
5cde2829 5910 LIST_HEAD(close_head);
9b5e383c 5911
93ee31f1
DL
5912 BUG_ON(dev_boot_phase);
5913 ASSERT_RTNL();
5914
e93737b0 5915 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5916 /* Some devices call this without ever having registered,
e93737b0
KK
5917 * as part of unwinding a failed initialization. Remove
5918 * those devices and proceed with the remaining.
9b5e383c
ED
5919 */
5920 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5921 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5922 dev->name, dev);
93ee31f1 5923
9b5e383c 5924 WARN_ON(1);
e93737b0
KK
5925 list_del(&dev->unreg_list);
5926 continue;
9b5e383c 5927 }
449f4544 5928 dev->dismantle = true;
9b5e383c 5929 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5930 }
93ee31f1 5931
44345724 5932 /* If device is running, close it first. */
5cde2829
EB
5933 list_for_each_entry(dev, head, unreg_list)
5934 list_add_tail(&dev->close_list, &close_head);
5935 dev_close_many(&close_head);
93ee31f1 5936
44345724 5937 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5938 /* And unlink it from device chain. */
5939 unlist_netdevice(dev);
93ee31f1 5940
9b5e383c
ED
5941 dev->reg_state = NETREG_UNREGISTERING;
5942 }
93ee31f1
DL
5943
5944 synchronize_net();
5945
9b5e383c 5946 list_for_each_entry(dev, head, unreg_list) {
395eea6c
MB
5947 struct sk_buff *skb = NULL;
5948
9b5e383c
ED
5949 /* Shutdown queueing discipline. */
5950 dev_shutdown(dev);
93ee31f1
DL
5951
5952
9b5e383c
ED
5953 /* Notify protocols that we are about to destroy
5954 this device. They should clean up all of their state.
5955 */
5956 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5957
395eea6c
MB
5958 if (!dev->rtnl_link_ops ||
5959 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5960 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5961 GFP_KERNEL);
5962
9b5e383c
ED
5963 /*
5964 * Flush the unicast and multicast chains
5965 */
a748ee24 5966 dev_uc_flush(dev);
22bedad3 5967 dev_mc_flush(dev);
93ee31f1 5968
9b5e383c
ED
5969 if (dev->netdev_ops->ndo_uninit)
5970 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5971
395eea6c
MB
5972 if (skb)
5973 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
56bfa7ee 5974
9ff162a8
JP
5975 /* The notifier chain MUST detach us from all upper devices. */
5976 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 5977
9b5e383c
ED
5978 /* Remove entries from kobject tree */
5979 netdev_unregister_kobject(dev);
024e9679
AD
5980#ifdef CONFIG_XPS
5981 /* Remove XPS queueing entries */
5982 netif_reset_xps_queues_gt(dev, 0);
5983#endif
9b5e383c 5984 }
93ee31f1 5985
850a545b 5986 synchronize_net();
395264d5 5987
a5ee1551 5988 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5989 dev_put(dev);
5990}
5991
5992static void rollback_registered(struct net_device *dev)
5993{
5994 LIST_HEAD(single);
5995
5996 list_add(&dev->unreg_list, &single);
5997 rollback_registered_many(&single);
ceaaec98 5998 list_del(&single);
93ee31f1
DL
5999}
6000
c8f44aff
MM
6001static netdev_features_t netdev_fix_features(struct net_device *dev,
6002 netdev_features_t features)
b63365a2 6003{
57422dc5
MM
6004 /* Fix illegal checksum combinations */
6005 if ((features & NETIF_F_HW_CSUM) &&
6006 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6007 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
6008 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6009 }
6010
b63365a2 6011 /* TSO requires that SG is present as well. */
ea2d3688 6012 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 6013 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 6014 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
6015 }
6016
ec5f0615
PS
6017 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6018 !(features & NETIF_F_IP_CSUM)) {
6019 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6020 features &= ~NETIF_F_TSO;
6021 features &= ~NETIF_F_TSO_ECN;
6022 }
6023
6024 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6025 !(features & NETIF_F_IPV6_CSUM)) {
6026 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6027 features &= ~NETIF_F_TSO6;
6028 }
6029
31d8b9e0
BH
6030 /* TSO ECN requires that TSO is present as well. */
6031 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6032 features &= ~NETIF_F_TSO_ECN;
6033
212b573f
MM
6034 /* Software GSO depends on SG. */
6035 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 6036 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
6037 features &= ~NETIF_F_GSO;
6038 }
6039
acd1130e 6040 /* UFO needs SG and checksumming */
b63365a2 6041 if (features & NETIF_F_UFO) {
79032644
MM
6042 /* maybe split UFO into V4 and V6? */
6043 if (!((features & NETIF_F_GEN_CSUM) ||
6044 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6045 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 6046 netdev_dbg(dev,
acd1130e 6047 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
6048 features &= ~NETIF_F_UFO;
6049 }
6050
6051 if (!(features & NETIF_F_SG)) {
6f404e44 6052 netdev_dbg(dev,
acd1130e 6053 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
6054 features &= ~NETIF_F_UFO;
6055 }
6056 }
6057
d0290214
JP
6058#ifdef CONFIG_NET_RX_BUSY_POLL
6059 if (dev->netdev_ops->ndo_busy_poll)
6060 features |= NETIF_F_BUSY_POLL;
6061 else
6062#endif
6063 features &= ~NETIF_F_BUSY_POLL;
6064
b63365a2
HX
6065 return features;
6066}
b63365a2 6067
6cb6a27c 6068int __netdev_update_features(struct net_device *dev)
5455c699 6069{
c8f44aff 6070 netdev_features_t features;
5455c699
MM
6071 int err = 0;
6072
87267485
MM
6073 ASSERT_RTNL();
6074
5455c699
MM
6075 features = netdev_get_wanted_features(dev);
6076
6077 if (dev->netdev_ops->ndo_fix_features)
6078 features = dev->netdev_ops->ndo_fix_features(dev, features);
6079
6080 /* driver might be less strict about feature dependencies */
6081 features = netdev_fix_features(dev, features);
6082
6083 if (dev->features == features)
6cb6a27c 6084 return 0;
5455c699 6085
c8f44aff
MM
6086 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6087 &dev->features, &features);
5455c699
MM
6088
6089 if (dev->netdev_ops->ndo_set_features)
6090 err = dev->netdev_ops->ndo_set_features(dev, features);
6091
6cb6a27c 6092 if (unlikely(err < 0)) {
5455c699 6093 netdev_err(dev,
c8f44aff
MM
6094 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6095 err, &features, &dev->features);
6cb6a27c
MM
6096 return -1;
6097 }
6098
6099 if (!err)
6100 dev->features = features;
6101
6102 return 1;
6103}
6104
afe12cc8
MM
6105/**
6106 * netdev_update_features - recalculate device features
6107 * @dev: the device to check
6108 *
6109 * Recalculate dev->features set and send notifications if it
6110 * has changed. Should be called after driver- or hardware-dependent
6111 * conditions that influence the features might have changed.
6112 */
6cb6a27c
MM
6113void netdev_update_features(struct net_device *dev)
6114{
6115 if (__netdev_update_features(dev))
6116 netdev_features_change(dev);
5455c699
MM
6117}
6118EXPORT_SYMBOL(netdev_update_features);
6119
afe12cc8
MM
6120/**
6121 * netdev_change_features - recalculate device features
6122 * @dev: the device to check
6123 *
6124 * Recalculate dev->features set and send notifications even
6125 * if they have not changed. Should be called instead of
6126 * netdev_update_features() if dev->vlan_features might also have
6127 * changed, to allow the changes to be propagated to stacked
6128 * VLAN devices.
6129 */
6130void netdev_change_features(struct net_device *dev)
6131{
6132 __netdev_update_features(dev);
6133 netdev_features_change(dev);
6134}
6135EXPORT_SYMBOL(netdev_change_features);
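/*
 * Illustrative sketch, not part of dev.c: a driver re-evaluating its
 * feature set after a runtime capability change (for example a firmware
 * reload that gained or lost an offload). The helper name and the
 * hw_can_csum flag are hypothetical; netdev_update_features() is the real
 * entry point and must be called under rtnl_lock() so that
 * ndo_fix_features()/ndo_set_features() run consistently.
 */
static void example_refresh_features(struct net_device *dev, bool hw_can_csum)
{
	rtnl_lock();
	if (hw_can_csum)
		dev->hw_features |= NETIF_F_HW_CSUM;
	else
		dev->hw_features &= ~NETIF_F_HW_CSUM;
	netdev_update_features(dev);
	rtnl_unlock();
}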
6136
fc4a7489
PM
6137/**
6138 * netif_stacked_transfer_operstate - transfer operstate
6139 * @rootdev: the root or lower level device to transfer state from
6140 * @dev: the device to transfer operstate to
6141 *
6142 * Transfer operational state from root to device. This is normally
6143 * called when a stacking relationship exists between the root
6144 * device and the device (a leaf device).
6145 */
6146void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6147 struct net_device *dev)
6148{
6149 if (rootdev->operstate == IF_OPER_DORMANT)
6150 netif_dormant_on(dev);
6151 else
6152 netif_dormant_off(dev);
6153
6154 if (netif_carrier_ok(rootdev)) {
6155 if (!netif_carrier_ok(dev))
6156 netif_carrier_on(dev);
6157 } else {
6158 if (netif_carrier_ok(dev))
6159 netif_carrier_off(dev);
6160 }
6161}
6162EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6163
a953be53 6164#ifdef CONFIG_SYSFS
1b4bf461
ED
6165static int netif_alloc_rx_queues(struct net_device *dev)
6166{
1b4bf461 6167 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 6168 struct netdev_rx_queue *rx;
1b4bf461 6169
bd25fa7b 6170 BUG_ON(count < 1);
1b4bf461 6171
bd25fa7b 6172 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 6173 if (!rx)
bd25fa7b 6174 return -ENOMEM;
62b5942a 6175
bd25fa7b
TH
6176 dev->_rx = rx;
6177
bd25fa7b 6178 for (i = 0; i < count; i++)
fe822240 6179 rx[i].dev = dev;
1b4bf461
ED
6180 return 0;
6181}
bf264145 6182#endif
1b4bf461 6183
aa942104
CG
6184static void netdev_init_one_queue(struct net_device *dev,
6185 struct netdev_queue *queue, void *_unused)
6186{
6187 /* Initialize queue lock */
6188 spin_lock_init(&queue->_xmit_lock);
6189 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6190 queue->xmit_lock_owner = -1;
b236da69 6191 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 6192 queue->dev = dev;
114cf580
TH
6193#ifdef CONFIG_BQL
6194 dql_init(&queue->dql, HZ);
6195#endif
aa942104
CG
6196}
6197
60877a32
ED
6198static void netif_free_tx_queues(struct net_device *dev)
6199{
4cb28970 6200 kvfree(dev->_tx);
60877a32
ED
6201}
6202
e6484930
TH
6203static int netif_alloc_netdev_queues(struct net_device *dev)
6204{
6205 unsigned int count = dev->num_tx_queues;
6206 struct netdev_queue *tx;
60877a32 6207 size_t sz = count * sizeof(*tx);
e6484930 6208
60877a32 6209 BUG_ON(count < 1 || count > 0xffff);
62b5942a 6210
60877a32
ED
6211 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6212 if (!tx) {
6213 tx = vzalloc(sz);
6214 if (!tx)
6215 return -ENOMEM;
6216 }
e6484930 6217 dev->_tx = tx;
1d24eb48 6218
e6484930
TH
6219 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6220 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
6221
6222 return 0;
e6484930
TH
6223}
6224
1da177e4
LT
6225/**
6226 * register_netdevice - register a network device
6227 * @dev: device to register
6228 *
6229 * Take a completed network device structure and add it to the kernel
6230 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6231 * chain. 0 is returned on success. A negative errno code is returned
6232 * on a failure to set up the device, or if the name is a duplicate.
6233 *
6234 * Callers must hold the rtnl semaphore. You may want
6235 * register_netdev() instead of this.
6236 *
6237 * BUGS:
6238 * The locking appears insufficient to guarantee two parallel registers
6239 * will not get the same name.
6240 */
6241
6242int register_netdevice(struct net_device *dev)
6243{
1da177e4 6244 int ret;
d314774c 6245 struct net *net = dev_net(dev);
1da177e4
LT
6246
6247 BUG_ON(dev_boot_phase);
6248 ASSERT_RTNL();
6249
b17a7c17
SH
6250 might_sleep();
6251
1da177e4
LT
6252 /* When net_device's are persistent, this will be fatal. */
6253 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 6254 BUG_ON(!net);
1da177e4 6255
f1f28aa3 6256 spin_lock_init(&dev->addr_list_lock);
cf508b12 6257 netdev_set_addr_lockdep_class(dev);
1da177e4 6258
1da177e4
LT
6259 dev->iflink = -1;
6260
828de4f6 6261 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
6262 if (ret < 0)
6263 goto out;
6264
1da177e4 6265 /* Init, if this function is available */
d314774c
SH
6266 if (dev->netdev_ops->ndo_init) {
6267 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
6268 if (ret) {
6269 if (ret > 0)
6270 ret = -EIO;
90833aa4 6271 goto out;
1da177e4
LT
6272 }
6273 }
4ec93edb 6274
f646968f
PM
6275 if (((dev->hw_features | dev->features) &
6276 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
6277 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6278 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6279 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6280 ret = -EINVAL;
6281 goto err_uninit;
6282 }
6283
9c7dafbf
PE
6284 ret = -EBUSY;
6285 if (!dev->ifindex)
6286 dev->ifindex = dev_new_index(net);
6287 else if (__dev_get_by_index(net, dev->ifindex))
6288 goto err_uninit;
6289
1da177e4
LT
6290 if (dev->iflink == -1)
6291 dev->iflink = dev->ifindex;
6292
5455c699
MM
6293 /* Transfer changeable features to wanted_features and enable
6294 * software offloads (GSO and GRO).
6295 */
6296 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
6297 dev->features |= NETIF_F_SOFT_FEATURES;
6298 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 6299
34324dc2
MM
6300 if (!(dev->flags & IFF_LOOPBACK)) {
6301 dev->hw_features |= NETIF_F_NOCACHE_COPY;
c6e1a0d1
TH
6302 }
6303
1180e7d6 6304 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 6305 */
1180e7d6 6306 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 6307
ee579677
PS
6308 /* Make NETIF_F_SG inheritable to tunnel devices.
6309 */
6310 dev->hw_enc_features |= NETIF_F_SG;
6311
0d89d203
SH
6312 /* Make NETIF_F_SG inheritable to MPLS.
6313 */
6314 dev->mpls_features |= NETIF_F_SG;
6315
7ffbe3fd
JB
6316 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6317 ret = notifier_to_errno(ret);
6318 if (ret)
6319 goto err_uninit;
6320
8b41d188 6321 ret = netdev_register_kobject(dev);
b17a7c17 6322 if (ret)
7ce1b0ed 6323 goto err_uninit;
b17a7c17
SH
6324 dev->reg_state = NETREG_REGISTERED;
6325
6cb6a27c 6326 __netdev_update_features(dev);
8e9b59b2 6327
1da177e4
LT
6328 /*
6329 * Default initial state at registry is that the
6330 * device is present.
6331 */
6332
6333 set_bit(__LINK_STATE_PRESENT, &dev->state);
6334
8f4cccbb
BH
6335 linkwatch_init_dev(dev);
6336
1da177e4 6337 dev_init_scheduler(dev);
1da177e4 6338 dev_hold(dev);
ce286d32 6339 list_netdevice(dev);
7bf23575 6340 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 6341
948b337e
JP
6342 /* If the device has permanent device address, driver should
6343 * set dev_addr and also addr_assign_type should be set to
6344 * NET_ADDR_PERM (default value).
6345 */
6346 if (dev->addr_assign_type == NET_ADDR_PERM)
6347 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6348
1da177e4 6349 /* Notify protocols, that a new device appeared. */
056925ab 6350 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 6351 ret = notifier_to_errno(ret);
93ee31f1
DL
6352 if (ret) {
6353 rollback_registered(dev);
6354 dev->reg_state = NETREG_UNREGISTERED;
6355 }
d90a909e
EB
6356 /*
6357 * Prevent userspace races by waiting until the network
6358 * device is fully setup before sending notifications.
6359 */
a2835763
PM
6360 if (!dev->rtnl_link_ops ||
6361 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7f294054 6362 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
1da177e4
LT
6363
6364out:
6365 return ret;
7ce1b0ed
HX
6366
6367err_uninit:
d314774c
SH
6368 if (dev->netdev_ops->ndo_uninit)
6369 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 6370 goto out;
1da177e4 6371}
d1b19dff 6372EXPORT_SYMBOL(register_netdevice);
1da177e4 6373
937f1ba5
BH
6374/**
6375 * init_dummy_netdev - init a dummy network device for NAPI
6376 * @dev: device to init
6377 *
6378 * This takes a network device structure and initializes the minimum
6379 * amount of fields so it can be used to schedule NAPI polls without
6380 * registering a full blown interface. This is to be used by drivers
6381 * that need to tie several hardware interfaces to a single NAPI
6382 * poll scheduler due to HW limitations.
6383 */
6384int init_dummy_netdev(struct net_device *dev)
6385{
6386 /* Clear everything. Note we don't initialize spinlocks
6387 * as they aren't supposed to be taken by any of the
6388 * NAPI code and this dummy netdev is supposed to be
6389 * only ever used for NAPI polls
6390 */
6391 memset(dev, 0, sizeof(struct net_device));
6392
6393 /* make sure we BUG if trying to hit standard
6394 * register/unregister code path
6395 */
6396 dev->reg_state = NETREG_DUMMY;
6397
937f1ba5
BH
6398 /* NAPI wants this */
6399 INIT_LIST_HEAD(&dev->napi_list);
6400
6401 /* a dummy interface is started by default */
6402 set_bit(__LINK_STATE_PRESENT, &dev->state);
6403 set_bit(__LINK_STATE_START, &dev->state);
6404
29b4433d
ED
6405 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6406 * because users of this 'device' don't need to change
6407 * its refcount.
6408 */
6409
937f1ba5
BH
6410 return 0;
6411}
6412EXPORT_SYMBOL_GPL(init_dummy_netdev);
6413
6414
1da177e4
LT
6415/**
6416 * register_netdev - register a network device
6417 * @dev: device to register
6418 *
6419 * Take a completed network device structure and add it to the kernel
6420 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6421 * chain. 0 is returned on success. A negative errno code is returned
6422 * on a failure to set up the device, or if the name is a duplicate.
6423 *
38b4da38 6424 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
6425 * and expands the device name if you passed a format string to
6426 * alloc_netdev.
6427 */
6428int register_netdev(struct net_device *dev)
6429{
6430 int err;
6431
6432 rtnl_lock();
1da177e4 6433 err = register_netdevice(dev);
1da177e4
LT
6434 rtnl_unlock();
6435 return err;
6436}
6437EXPORT_SYMBOL(register_netdev);
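/*
 * Illustrative sketch, not part of dev.c: the usual probe/remove lifecycle
 * around register_netdev(). Everything except the real APIs
 * (alloc_etherdev/netdev_priv/register_netdev/unregister_netdev/free_netdev,
 * eth_hw_addr_random) is hypothetical, including struct example_priv and
 * example_netdev_ops, which a real driver would populate with its ndo
 * callbacks. Assumes <linux/etherdevice.h>.
 */
struct example_priv {
	int dummy;			/* driver state would live here */
};

static const struct net_device_ops example_netdev_ops;	/* assumed populated */

static struct net_device *example_probe(void)
{
	struct net_device *dev;
	struct example_priv *priv;

	dev = alloc_etherdev(sizeof(*priv));	/* ether_setup() + priv area */
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->dummy = 0;
	dev->netdev_ops = &example_netdev_ops;
	eth_hw_addr_random(dev);

	if (register_netdev(dev)) {		/* takes rtnl_lock() itself */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);			/* waits for all references */
	free_netdev(dev);
}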
6438
29b4433d
ED
6439int netdev_refcnt_read(const struct net_device *dev)
6440{
6441 int i, refcnt = 0;
6442
6443 for_each_possible_cpu(i)
6444 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6445 return refcnt;
6446}
6447EXPORT_SYMBOL(netdev_refcnt_read);
6448
2c53040f 6449/**
1da177e4 6450 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 6451 * @dev: target net_device
1da177e4
LT
6452 *
6453 * This is called when unregistering network devices.
6454 *
6455 * Any protocol or device that holds a reference should register
6456 * for netdevice notification, and clean up and put back the
6457 * reference if they receive an UNREGISTER event.
6458 * We can get stuck here if buggy protocols don't correctly
4ec93edb 6459 * call dev_put.
1da177e4
LT
6460 */
6461static void netdev_wait_allrefs(struct net_device *dev)
6462{
6463 unsigned long rebroadcast_time, warning_time;
29b4433d 6464 int refcnt;
1da177e4 6465
e014debe
ED
6466 linkwatch_forget_dev(dev);
6467
1da177e4 6468 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
6469 refcnt = netdev_refcnt_read(dev);
6470
6471 while (refcnt != 0) {
1da177e4 6472 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 6473 rtnl_lock();
1da177e4
LT
6474
6475 /* Rebroadcast unregister notification */
056925ab 6476 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 6477
748e2d93 6478 __rtnl_unlock();
0115e8e3 6479 rcu_barrier();
748e2d93
ED
6480 rtnl_lock();
6481
0115e8e3 6482 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
6483 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6484 &dev->state)) {
6485 /* We must not have linkwatch events
6486 * pending on unregister. If this
6487 * happens, we simply run the queue
6488 * unscheduled, resulting in a noop
6489 * for this device.
6490 */
6491 linkwatch_run_queue();
6492 }
6493
6756ae4b 6494 __rtnl_unlock();
1da177e4
LT
6495
6496 rebroadcast_time = jiffies;
6497 }
6498
6499 msleep(250);
6500
29b4433d
ED
6501 refcnt = netdev_refcnt_read(dev);
6502
1da177e4 6503 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
6504 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6505 dev->name, refcnt);
1da177e4
LT
6506 warning_time = jiffies;
6507 }
6508 }
6509}
6510
6511/* The sequence is:
6512 *
6513 * rtnl_lock();
6514 * ...
6515 * register_netdevice(x1);
6516 * register_netdevice(x2);
6517 * ...
6518 * unregister_netdevice(y1);
6519 * unregister_netdevice(y2);
6520 * ...
6521 * rtnl_unlock();
6522 * free_netdev(y1);
6523 * free_netdev(y2);
6524 *
58ec3b4d 6525 * We are invoked by rtnl_unlock().
1da177e4 6526 * This allows us to deal with problems:
b17a7c17 6527 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
6528 * without deadlocking with linkwatch via keventd.
6529 * 2) Since we run with the RTNL semaphore not held, we can sleep
6530 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
6531 *
6532 * We must not return until all unregister events added during
6533 * the interval the lock was held have been completed.
1da177e4 6534 */
1da177e4
LT
6535void netdev_run_todo(void)
6536{
626ab0e6 6537 struct list_head list;
1da177e4 6538
1da177e4 6539 /* Snapshot list, allow later requests */
626ab0e6 6540 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
6541
6542 __rtnl_unlock();
626ab0e6 6543
0115e8e3
ED
6544
6545 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
6546 if (!list_empty(&list))
6547 rcu_barrier();
6548
1da177e4
LT
6549 while (!list_empty(&list)) {
6550 struct net_device *dev
e5e26d75 6551 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
6552 list_del(&dev->todo_list);
6553
748e2d93 6554 rtnl_lock();
0115e8e3 6555 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 6556 __rtnl_unlock();
0115e8e3 6557
b17a7c17 6558 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 6559 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
6560 dev->name, dev->reg_state);
6561 dump_stack();
6562 continue;
6563 }
1da177e4 6564
b17a7c17 6565 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 6566
152102c7 6567 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 6568
b17a7c17 6569 netdev_wait_allrefs(dev);
1da177e4 6570
b17a7c17 6571 /* paranoia */
29b4433d 6572 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
6573 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6574 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 6575 WARN_ON(dev->dn_ptr);
1da177e4 6576
b17a7c17
SH
6577 if (dev->destructor)
6578 dev->destructor(dev);
9093bbb2 6579
50624c93
EB
6580 /* Report a network device has been unregistered */
6581 rtnl_lock();
6582 dev_net(dev)->dev_unreg_count--;
6583 __rtnl_unlock();
6584 wake_up(&netdev_unregistering_wq);
6585
9093bbb2
SH
6586 /* Free network device */
6587 kobject_put(&dev->dev.kobj);
1da177e4 6588 }
1da177e4
LT
6589}
6590
3cfde79c
BH
6591/* Convert net_device_stats to rtnl_link_stats64. They have the same
6592 * fields in the same order, with only the type differing.
6593 */
77a1abf5
ED
6594void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6595 const struct net_device_stats *netdev_stats)
3cfde79c
BH
6596{
6597#if BITS_PER_LONG == 64
77a1abf5
ED
6598 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6599 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
6600#else
6601 size_t i, n = sizeof(*stats64) / sizeof(u64);
6602 const unsigned long *src = (const unsigned long *)netdev_stats;
6603 u64 *dst = (u64 *)stats64;
6604
6605 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6606 sizeof(*stats64) / sizeof(u64));
6607 for (i = 0; i < n; i++)
6608 dst[i] = src[i];
6609#endif
6610}
77a1abf5 6611EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 6612
eeda3fd6
SH
6613/**
6614 * dev_get_stats - get network device statistics
6615 * @dev: device to get statistics from
28172739 6616 * @storage: place to store stats
eeda3fd6 6617 *
d7753516
BH
6618 * Get network statistics from device. Return @storage.
6619 * The device driver may provide its own method by setting
6620 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6621 * otherwise the internal statistics structure is used.
eeda3fd6 6622 */
d7753516
BH
6623struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6624 struct rtnl_link_stats64 *storage)
7004bf25 6625{
eeda3fd6
SH
6626 const struct net_device_ops *ops = dev->netdev_ops;
6627
28172739
ED
6628 if (ops->ndo_get_stats64) {
6629 memset(storage, 0, sizeof(*storage));
caf586e5
ED
6630 ops->ndo_get_stats64(dev, storage);
6631 } else if (ops->ndo_get_stats) {
3cfde79c 6632 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
6633 } else {
6634 netdev_stats_to_stats64(storage, &dev->stats);
28172739 6635 }
caf586e5 6636 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
015f0688 6637 storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
28172739 6638 return storage;
c45d286e 6639}
eeda3fd6 6640EXPORT_SYMBOL(dev_get_stats);
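/*
 * Illustrative sketch, not part of dev.c: snapshotting a device's counters.
 * The helper name is hypothetical; dev_get_stats() is the real API. The
 * counters are copied into caller-provided storage, so the snapshot stays
 * valid after the call; hold RTNL or RCU while dereferencing @dev itself.
 */
static u64 example_total_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets + stats.tx_packets;
}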
c45d286e 6641
24824a09 6642struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 6643{
24824a09 6644 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 6645
24824a09
ED
6646#ifdef CONFIG_NET_CLS_ACT
6647 if (queue)
6648 return queue;
6649 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6650 if (!queue)
6651 return NULL;
6652 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
6653 queue->qdisc = &noop_qdisc;
6654 queue->qdisc_sleeping = &noop_qdisc;
6655 rcu_assign_pointer(dev->ingress_queue, queue);
6656#endif
6657 return queue;
bb949fbd
DM
6658}
6659
2c60db03
ED
6660static const struct ethtool_ops default_ethtool_ops;
6661
d07d7507
SG
6662void netdev_set_default_ethtool_ops(struct net_device *dev,
6663 const struct ethtool_ops *ops)
6664{
6665 if (dev->ethtool_ops == &default_ethtool_ops)
6666 dev->ethtool_ops = ops;
6667}
6668EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6669
74d332c1
ED
6670void netdev_freemem(struct net_device *dev)
6671{
6672 char *addr = (char *)dev - dev->padded;
6673
4cb28970 6674 kvfree(addr);
74d332c1
ED
6675}
6676
1da177e4 6677/**
36909ea4 6678 * alloc_netdev_mqs - allocate network device
c835a677
TG
6679 * @sizeof_priv: size of private data to allocate space for
6680 * @name: device name format string
6681 * @name_assign_type: origin of device name
6682 * @setup: callback to initialize device
6683 * @txqs: the number of TX subqueues to allocate
6684 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
6685 *
6686 * Allocates a struct net_device with private data area for driver use
90e51adf 6687 * and performs basic initialization. Also allocates subqueue structs
36909ea4 6688 * for each queue on the device.
1da177e4 6689 */
36909ea4 6690struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
c835a677 6691 unsigned char name_assign_type,
36909ea4
TH
6692 void (*setup)(struct net_device *),
6693 unsigned int txqs, unsigned int rxqs)
1da177e4 6694{
1da177e4 6695 struct net_device *dev;
7943986c 6696 size_t alloc_size;
1ce8e7b5 6697 struct net_device *p;
1da177e4 6698
b6fe17d6
SH
6699 BUG_ON(strlen(name) >= sizeof(dev->name));
6700
36909ea4 6701 if (txqs < 1) {
7b6cd1ce 6702 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
6703 return NULL;
6704 }
6705
a953be53 6706#ifdef CONFIG_SYSFS
36909ea4 6707 if (rxqs < 1) {
7b6cd1ce 6708 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
6709 return NULL;
6710 }
6711#endif
6712
fd2ea0a7 6713 alloc_size = sizeof(struct net_device);
d1643d24
AD
6714 if (sizeof_priv) {
6715 /* ensure 32-byte alignment of private area */
1ce8e7b5 6716 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
6717 alloc_size += sizeof_priv;
6718 }
6719 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 6720 alloc_size += NETDEV_ALIGN - 1;
1da177e4 6721
74d332c1
ED
6722 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6723 if (!p)
6724 p = vzalloc(alloc_size);
62b5942a 6725 if (!p)
1da177e4 6726 return NULL;
1da177e4 6727
1ce8e7b5 6728 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 6729 dev->padded = (char *)dev - (char *)p;
ab9c73cc 6730
29b4433d
ED
6731 dev->pcpu_refcnt = alloc_percpu(int);
6732 if (!dev->pcpu_refcnt)
74d332c1 6733 goto free_dev;
ab9c73cc 6734
ab9c73cc 6735 if (dev_addr_init(dev))
29b4433d 6736 goto free_pcpu;
ab9c73cc 6737
22bedad3 6738 dev_mc_init(dev);
a748ee24 6739 dev_uc_init(dev);
ccffad25 6740
c346dca1 6741 dev_net_set(dev, &init_net);
1da177e4 6742
8d3bdbd5 6743 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 6744 dev->gso_max_segs = GSO_MAX_SEGS;
fcbeb976 6745 dev->gso_min_segs = 0;
8d3bdbd5 6746
8d3bdbd5
DM
6747 INIT_LIST_HEAD(&dev->napi_list);
6748 INIT_LIST_HEAD(&dev->unreg_list);
5cde2829 6749 INIT_LIST_HEAD(&dev->close_list);
8d3bdbd5 6750 INIT_LIST_HEAD(&dev->link_watch_list);
2f268f12
VF
6751 INIT_LIST_HEAD(&dev->adj_list.upper);
6752 INIT_LIST_HEAD(&dev->adj_list.lower);
6753 INIT_LIST_HEAD(&dev->all_adj_list.upper);
6754 INIT_LIST_HEAD(&dev->all_adj_list.lower);
02875878 6755 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
8d3bdbd5
DM
6756 setup(dev);
6757
36909ea4
TH
6758 dev->num_tx_queues = txqs;
6759 dev->real_num_tx_queues = txqs;
ed9af2e8 6760 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6761 goto free_all;
e8a0464c 6762
a953be53 6763#ifdef CONFIG_SYSFS
36909ea4
TH
6764 dev->num_rx_queues = rxqs;
6765 dev->real_num_rx_queues = rxqs;
fe822240 6766 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6767 goto free_all;
df334545 6768#endif
0a9627f2 6769
1da177e4 6770 strcpy(dev->name, name);
c835a677 6771 dev->name_assign_type = name_assign_type;
cbda10fa 6772 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
6773 if (!dev->ethtool_ops)
6774 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 6775 return dev;
ab9c73cc 6776
8d3bdbd5
DM
6777free_all:
6778 free_netdev(dev);
6779 return NULL;
6780
29b4433d
ED
6781free_pcpu:
6782 free_percpu(dev->pcpu_refcnt);
74d332c1
ED
6783free_dev:
6784 netdev_freemem(dev);
ab9c73cc 6785 return NULL;
1da177e4 6786}
36909ea4 6787EXPORT_SYMBOL(alloc_netdev_mqs);
1da177e4
LT
6788
6789/**
6790 * free_netdev - free network device
6791 * @dev: device
6792 *
4ec93edb
YH
6793 * This function does the last stage of destroying an allocated device
6794 * interface. The reference to the device object is released.
1da177e4
LT
6795 * If this is the last reference then it will be freed.
6796 */
6797void free_netdev(struct net_device *dev)
6798{
d565b0a1
HX
6799 struct napi_struct *p, *n;
6800
f3005d7f
DL
6801 release_net(dev_net(dev));
6802
60877a32 6803 netif_free_tx_queues(dev);
a953be53 6804#ifdef CONFIG_SYSFS
fe822240
TH
6805 kfree(dev->_rx);
6806#endif
e8a0464c 6807
33d480ce 6808 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6809
f001fde5
JP
6810 /* Flush device addresses */
6811 dev_addr_flush(dev);
6812
d565b0a1
HX
6813 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6814 netif_napi_del(p);
6815
29b4433d
ED
6816 free_percpu(dev->pcpu_refcnt);
6817 dev->pcpu_refcnt = NULL;
6818
3041a069 6819 /* Compatibility with error handling in drivers */
1da177e4 6820 if (dev->reg_state == NETREG_UNINITIALIZED) {
74d332c1 6821 netdev_freemem(dev);
1da177e4
LT
6822 return;
6823 }
6824
6825 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6826 dev->reg_state = NETREG_RELEASED;
6827
43cb76d9
GKH
6828 /* will free via device release */
6829 put_device(&dev->dev);
1da177e4 6830}
d1b19dff 6831EXPORT_SYMBOL(free_netdev);
4ec93edb 6832
f0db275a
SH
6833/**
6834 * synchronize_net - Synchronize with packet receive processing
6835 *
6836 * Wait for packets currently being received to be done.
6837 * Does not block later packets from starting.
6838 */
4ec93edb 6839void synchronize_net(void)
1da177e4
LT
6840{
6841 might_sleep();
be3fc413
ED
6842 if (rtnl_is_locked())
6843 synchronize_rcu_expedited();
6844 else
6845 synchronize_rcu();
1da177e4 6846}
d1b19dff 6847EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6848
6849/**
44a0873d 6850 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6851 * @dev: device
44a0873d 6852 * @head: list
6ebfbc06 6853 *
1da177e4 6854 * This function shuts down a device interface and removes it
d59b54b1 6855 * from the kernel tables.
44a0873d 6856 * If @head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
6857 *
6858 * Callers must hold the rtnl semaphore. You may want
6859 * unregister_netdev() instead of this.
6860 */
6861
44a0873d 6862void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6863{
a6620712
HX
6864 ASSERT_RTNL();
6865
44a0873d 6866 if (head) {
9fdce099 6867 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6868 } else {
6869 rollback_registered(dev);
6870 /* Finish processing unregister after unlock */
6871 net_set_todo(dev);
6872 }
1da177e4 6873}
44a0873d 6874EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6875
9b5e383c
ED
6876/**
6877 * unregister_netdevice_many - unregister many devices
6878 * @head: list of devices
87757a91
ED
6879 *
6880 * Note: As most callers use a stack-allocated list_head,
6881 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
6882 */
6883void unregister_netdevice_many(struct list_head *head)
6884{
6885 struct net_device *dev;
6886
6887 if (!list_empty(head)) {
6888 rollback_registered_many(head);
6889 list_for_each_entry(dev, head, unreg_list)
6890 net_set_todo(dev);
87757a91 6891 list_del(head);
9b5e383c
ED
6892 }
6893}
63c8099d 6894EXPORT_SYMBOL(unregister_netdevice_many);
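/*
 * Illustrative sketch, not part of dev.c: tearing down several devices in
 * one batch, the pattern rtnl_link_ops->dellink() users follow. The helper
 * name and the devs[] array are hypothetical; unregister_netdevice_queue()
 * and unregister_netdevice_many() are the real APIs. Batching keeps the
 * expensive synchronize_net()/notifier work to one pass instead of one
 * pass per device.
 */
static void example_destroy_all(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);	/* also list_del()s kill_list */
	rtnl_unlock();
}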
9b5e383c 6895
1da177e4
LT
6896/**
6897 * unregister_netdev - remove device from the kernel
6898 * @dev: device
6899 *
6900 * This function shuts down a device interface and removes it
d59b54b1 6901 * from the kernel tables.
1da177e4
LT
6902 *
6903 * This is just a wrapper for unregister_netdevice that takes
6904 * the rtnl semaphore. In general you want to use this and not
6905 * unregister_netdevice.
6906 */
6907void unregister_netdev(struct net_device *dev)
6908{
6909 rtnl_lock();
6910 unregister_netdevice(dev);
6911 rtnl_unlock();
6912}
1da177e4
LT
6913EXPORT_SYMBOL(unregister_netdev);
6914
ce286d32
EB
6915/**
6916 * dev_change_net_namespace - move device to a different network namespace
6917 * @dev: device
6918 * @net: network namespace
6919 * @pat: If not NULL, name pattern to try if the current device name
6920 * is already taken in the destination network namespace.
6921 *
6922 * This function shuts down a device interface and moves it
6923 * to a new network namespace. On success 0 is returned, on
6924 * a failure a negative errno code is returned.
6925 *
6926 * Callers must hold the rtnl semaphore.
6927 */
6928
6929int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6930{
ce286d32
EB
6931 int err;
6932
6933 ASSERT_RTNL();
6934
6935 /* Don't allow namespace local devices to be moved. */
6936 err = -EINVAL;
6937 if (dev->features & NETIF_F_NETNS_LOCAL)
6938 goto out;
6939
6940 /* Ensure the device has been registered */
ce286d32
EB
6941 if (dev->reg_state != NETREG_REGISTERED)
6942 goto out;
6943
6944 /* Get out if there is nothing to do */
6945 err = 0;
878628fb 6946 if (net_eq(dev_net(dev), net))
ce286d32
EB
6947 goto out;
6948
6949 /* Pick the destination device name, and ensure
6950 * we can use it in the destination network namespace.
6951 */
6952 err = -EEXIST;
d9031024 6953 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6954 /* We get here if we can't use the current device name */
6955 if (!pat)
6956 goto out;
828de4f6 6957 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
6958 goto out;
6959 }
6960
6961 /*
6962 * And now a mini version of register_netdevice and unregister_netdevice.
6963 */
6964
6965 /* If device is running close it first. */
9b772652 6966 dev_close(dev);
ce286d32
EB
6967
6968 /* And unlink it from device chain */
6969 err = -ENODEV;
6970 unlist_netdevice(dev);
6971
6972 synchronize_net();
6973
6974 /* Shutdown queueing discipline. */
6975 dev_shutdown(dev);
6976
6977 /* Notify protocols that we are about to destroy
6978 this device. They should clean up all of their state.
3b27e105
DL
6979
6980 Note that dev->reg_state stays at NETREG_REGISTERED.
6981 This is wanted because this way 8021q and macvlan know
6982 the device is just moving and can keep their slaves up.
ce286d32
EB
6983 */
6984 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
6985 rcu_barrier();
6986 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7f294054 6987 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
ce286d32
EB
6988
6989 /*
6990 * Flush the unicast and multicast chains
6991 */
a748ee24 6992 dev_uc_flush(dev);
22bedad3 6993 dev_mc_flush(dev);
ce286d32 6994
4e66ae2e
SH
6995 /* Send a netdev-removed uevent to the old namespace */
6996 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
4c75431a 6997 netdev_adjacent_del_links(dev);
4e66ae2e 6998
ce286d32 6999 /* Actually switch the network namespace */
c346dca1 7000 dev_net_set(dev, net);
ce286d32 7001
ce286d32
EB
7002 /* If there is an ifindex conflict assign a new one */
7003 if (__dev_get_by_index(net, dev->ifindex)) {
7004 int iflink = (dev->iflink == dev->ifindex);
7005 dev->ifindex = dev_new_index(net);
7006 if (iflink)
7007 dev->iflink = dev->ifindex;
7008 }
7009
4e66ae2e
SH
7010 /* Send a netdev-add uevent to the new namespace */
7011 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
4c75431a 7012 netdev_adjacent_add_links(dev);
4e66ae2e 7013
8b41d188 7014 /* Fixup kobjects */
a1b3f594 7015 err = device_rename(&dev->dev, dev->name);
8b41d188 7016 WARN_ON(err);
ce286d32
EB
7017
7018 /* Add the device back in the hashes */
7019 list_netdevice(dev);
7020
7021 /* Notify protocols, that a new device appeared. */
7022 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7023
d90a909e
EB
7024 /*
7025 * Prevent userspace races by waiting until the network
7026 * device is fully setup before sending notifications.
7027 */
7f294054 7028 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
d90a909e 7029
ce286d32
EB
7030 synchronize_net();
7031 err = 0;
7032out:
7033 return err;
7034}
463d0183 7035EXPORT_SYMBOL_GPL(dev_change_net_namespace);
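
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * holds the RTNL lock around dev_change_net_namespace() and passes a name
 * pattern such as "dev%d" so a conflicting name in the target namespace can
 * be resolved automatically.  The helper name and the way the destination
 * namespace is obtained here are assumptions made for the example only.
 */
#if 0
static int example_move_to_netns(struct net_device *dev, int netns_fd)
{
	struct net *net;
	int err;

	net = get_net_ns_by_fd(netns_fd);	/* resolve target namespace */
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "dev%d");
	rtnl_unlock();

	put_net(net);
	return err;
}
#endif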

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU. */
	if (!list_empty(&oldsd->poll_list)) {
		list_splice_init(&oldsd->poll_list, &sd->poll_list);
		raise_softirq_irqoff(NET_RX_SOFTIRQ);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_internal(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_internal(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
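
/*
 * Illustrative sketch (assumption, not from this file): an aggregating
 * driver such as a bonding-style master would fold each slave's feature
 * set into its own by calling netdev_increment_features() once per slave.
 * EXAMPLE_MASTER_FEATURES, the array-based slave list and the function
 * name are placeholders invented for this example.
 */
#if 0
#define EXAMPLE_MASTER_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA)

static netdev_features_t example_compute_master_features(struct net_device *slaves[],
							  int nr_slaves)
{
	netdev_features_t features = EXAMPLE_MASTER_FEATURES;
	int i;

	/* ALL_FOR_ALL features stay set only if every slave supports them;
	 * ONE_FOR_ALL features are enabled if any slave supports them.
	 */
	for (i = 0; i < nr_slaves; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     EXAMPLE_MASTER_FEATURES);
	return features;
}
#endif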

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 * netdev_drivername - network driver for the device
 * @dev: network device
 *
 * Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

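/*
 * Illustrative sketch (assumption, not from this file): netdev_drivername()
 * is mainly useful in diagnostics, for example when reporting a transmit
 * timeout and naming the driver responsible for the device.  The function
 * name and message text are invented for this example.
 */
#if 0
static void example_report_tx_timeout(struct net_device *dev)
{
	WARN_ONCE(1, "NETDEV WATCHDOG: %s (%s): transmit queue timed out\n",
		  dev->name, netdev_drivername(dev));
}
#endif
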
static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	__netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
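
/*
 * Illustrative sketch (assumption, not from this file): drivers call the
 * level-specific wrappers generated above instead of raw printk(), so the
 * driver and interface name are prefixed automatically.  The open routine,
 * its conditions and the message text are invented for this example.
 */
#if 0
static int example_open(struct net_device *dev)
{
	if (!netif_carrier_ok(dev))
		netdev_info(dev, "link is down at open time\n");

	if (dev->mtu > 9000) {
		netdev_err(dev, "MTU %u not supported\n", dev->mtu);
		return -EINVAL;
	}

	return 0;
}
#endif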

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed
	 * wait here for all pending unregistrations to complete,
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 * Initialize the DEV module. At boot time this walks the device list and
 * unhooks any devices that fail to initialise (normally hardware not
 * present) and leaves us with a valid list of present and active devices.
 */

/*
 * This is called single threaded during boot, so no need
 * to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 * Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device the first device on the list of
	 * network devices, so that it is the first device that appears
	 * and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);