Commit | Line | Data |
---|---|---|
482a4360 | 1 | .. SPDX-License-Identifier: GPL-2.0 |
1da177e4 | 2 | |
482a4360 | 3 | ===================================== |
1da177e4 | 4 | Network Devices, the Kernel, and You! |
482a4360 | 5 | ===================================== |
1da177e4 LT |
6 | |
7 | ||
8 | Introduction | |
9 | ============ | |
10 | The following is a random collection of documentation regarding | |
11 | network devices. | |
12 | ||
2b446e65 JK |
13 | struct net_device lifetime rules |
14 | ================================ | |
1da177e4 | 15 | Network device structures need to persist even after module is unloaded and |
74d332c1 ED |
16 | must be allocated with alloc_netdev_mqs() and friends. |
17 | If device has registered successfully, it will be freed on last use | |
2b446e65 JK |
18 | by free_netdev(). This is required to handle the pathological case cleanly |
19 | (example: ``rmmod mydriver </sys/class/net/myeth/mtu``) | |
1da177e4 | 20 | |
2b446e65 | 21 | alloc_netdev_mqs() / alloc_netdev() reserve extra space for driver |
1da177e4 LT |
22 | private data which gets freed when the network device is freed. If |
23 | separately allocated data is attached to the network device | |
2b446e65 JK |
24 | (netdev_priv()) then it is up to the module exit handler to free that. |
25 | ||
26 | There are two groups of APIs for registering struct net_device. | |
27 | First group can be used in normal contexts where ``rtnl_lock`` is not already | |
28 | held: register_netdev(), unregister_netdev(). | |
29 | Second group can be used when ``rtnl_lock`` is already held: | |
30 | register_netdevice(), unregister_netdevice(), free_netdev(). | |
31 | ||
32 | Simple drivers | |
33 | -------------- | |
34 | ||
35 | Most drivers (especially device drivers) handle lifetime of struct net_device | |
36 | in context where ``rtnl_lock`` is not held (e.g. driver probe and remove paths). | |
37 | ||
38 | In that case the struct net_device registration is done using | |
39 | the register_netdev(), and unregister_netdev() functions: | |
40 | ||
41 | .. code-block:: c | |
42 | ||
43 | int probe() | |
44 | { | |
45 | struct my_device_priv *priv; | |
46 | int err; | |
47 | ||
48 | dev = alloc_netdev_mqs(...); | |
49 | if (!dev) | |
50 | return -ENOMEM; | |
51 | priv = netdev_priv(dev); | |
52 | ||
53 | /* ... do all device setup before calling register_netdev() ... | |
54 | */ | |
55 | ||
56 | err = register_netdev(dev); | |
57 | if (err) | |
58 | goto err_undo; | |
59 | ||
60 | /* net_device is visible to the user! */ | |
61 | ||
62 | err_undo: | |
63 | /* ... undo the device setup ... */ | |
64 | free_netdev(dev); | |
65 | return err; | |
66 | } | |
67 | ||
68 | void remove() | |
69 | { | |
70 | unregister_netdev(dev); | |
71 | free_netdev(dev); | |
72 | } | |
73 | ||
74 | Note that after calling register_netdev() the device is visible in the system. | |
75 | Users can open it and start sending / receiving traffic immediately, | |
76 | or run any other callback, so all initialization must be done prior to | |
77 | registration. | |
78 | ||
79 | unregister_netdev() closes the device and waits for all users to be done | |
80 | with it. The memory of struct net_device itself may still be referenced | |
81 | by sysfs but all operations on that device will fail. | |
82 | ||
83 | free_netdev() can be called after unregister_netdev() returns or when | |
84 | register_netdev() failed. | |
85 | ||
86 | Device management under RTNL | |
87 | ---------------------------- | |
88 | ||
89 | Registering struct net_device while in context which already holds | |
90 | the ``rtnl_lock`` requires extra care. In those scenarios most drivers | |
91 | will want to make use of struct net_device's ``needs_free_netdev`` | |
92 | and ``priv_destructor`` members for freeing of state. | |
93 | ||
94 | Example flow of netdev handling under ``rtnl_lock``: | |
95 | ||
96 | .. code-block:: c | |
97 | ||
98 | static void my_setup(struct net_device *dev) | |
99 | { | |
100 | dev->needs_free_netdev = true; | |
101 | } | |
102 | ||
103 | static void my_destructor(struct net_device *dev) | |
104 | { | |
105 | some_obj_destroy(priv->obj); | |
106 | some_uninit(priv); | |
107 | } | |
108 | ||
109 | int create_link() | |
110 | { | |
111 | struct my_device_priv *priv; | |
112 | int err; | |
113 | ||
114 | ASSERT_RTNL(); | |
115 | ||
116 | dev = alloc_netdev(sizeof(*priv), "net%d", NET_NAME_UNKNOWN, my_setup); | |
117 | if (!dev) | |
118 | return -ENOMEM; | |
119 | priv = netdev_priv(dev); | |
120 | ||
121 | /* Implicit constructor */ | |
122 | err = some_init(priv); | |
123 | if (err) | |
124 | goto err_free_dev; | |
125 | ||
126 | priv->obj = some_obj_create(); | |
127 | if (!priv->obj) { | |
128 | err = -ENOMEM; | |
129 | goto err_some_uninit; | |
130 | } | |
131 | /* End of constructor, set the destructor: */ | |
132 | dev->priv_destructor = my_destructor; | |
133 | ||
134 | err = register_netdevice(dev); | |
135 | if (err) | |
136 | /* register_netdevice() calls destructor on failure */ | |
137 | goto err_free_dev; | |
138 | ||
139 | /* If anything fails now unregister_netdevice() (or unregister_netdev()) | |
140 | * will take care of calling my_destructor and free_netdev(). | |
141 | */ | |
142 | ||
143 | return 0; | |
144 | ||
145 | err_some_uninit: | |
146 | some_uninit(priv); | |
147 | err_free_dev: | |
148 | free_netdev(dev); | |
149 | return err; | |
150 | } | |
151 | ||
152 | If struct net_device.priv_destructor is set it will be called by the core | |
153 | some time after unregister_netdevice(), it will also be called if | |
154 | register_netdevice() fails. The callback may be invoked with or without | |
155 | ``rtnl_lock`` held. | |
156 | ||
157 | There is no explicit constructor callback, driver "constructs" the private | |
158 | netdev state after allocating it and before registration. | |
159 | ||
160 | Setting struct net_device.needs_free_netdev makes core call free_netdev() | |
161 | automatically after unregister_netdevice() when all references to the device | |
162 | are gone. It only takes effect after a successful call to register_netdevice() | |
163 | so if register_netdevice() fails driver is responsible for calling | |
164 | free_netdev(). | |
165 | ||
166 | free_netdev() is safe to call on error paths right after unregister_netdevice() | |
167 | or when register_netdevice() fails. Parts of netdev (de)registration process | |
168 | happen after ``rtnl_lock`` is released, therefore in those cases free_netdev() | |
169 | will defer some of the processing until ``rtnl_lock`` is released. | |
170 | ||
171 | Devices spawned from struct rtnl_link_ops should never free the | |
172 | struct net_device directly. | |
173 | ||
174 | .ndo_init and .ndo_uninit | |
175 | ~~~~~~~~~~~~~~~~~~~~~~~~~ | |
176 | ||
177 | ``.ndo_init`` and ``.ndo_uninit`` callbacks are called during net_device | |
178 | registration and de-registration, under ``rtnl_lock``. Drivers can use | |
179 | those e.g. when parts of their init process need to run under ``rtnl_lock``. | |
180 | ||
181 | ``.ndo_init`` runs before device is visible in the system, ``.ndo_uninit`` | |
182 | runs during de-registering after device is closed but other subsystems | |
183 | may still have outstanding references to the netdevice. | |
1da177e4 | 184 | |
1c8c7d64 SH |
185 | MTU |
186 | === | |
187 | Each network device has a Maximum Transfer Unit. The MTU does not | |
188 | include any link layer protocol overhead. Upper layer protocols must | |
189 | not pass a socket buffer (skb) to a device to transmit with more data | |
190 | than the mtu. The MTU does not include link layer header overhead, so | |
191 | for example on Ethernet if the standard MTU of 1500 bytes is used, the | |
192 | actual skb will contain up to 1514 bytes because of the Ethernet | |
193 | header. Devices should allow for the 4 byte VLAN header as well. | |
194 | ||
195 | Segmentation Offload (GSO, TSO) is an exception to this rule. The | |
196 | upper layer protocol may pass a large socket buffer to the device | |
197 | transmit routine, and the device will break that up into separate | |
198 | packets based on the current MTU. | |
199 | ||
200 | MTU is symmetrical and applies both to receive and transmit. A device | |
201 | must be able to receive at least the maximum size packet allowed by | |
202 | the MTU. A network device may use the MTU as a mechanism to size receive | |
203 | buffers, but the device should allow packets with VLAN header. With | |
204 | standard Ethernet mtu of 1500 bytes, the device should allow up to | |
205 | 1518 byte packets (1500 + 14 header + 4 tag). The device may either: | |
206 | drop, truncate, or pass up oversize packets, but dropping oversize | |
207 | packets is preferred. | |
208 | ||
209 | ||
1da177e4 LT |
210 | struct net_device synchronization rules |
211 | ======================================= | |
b3cf6545 | 212 | ndo_open: |
1da177e4 LT |
213 | Synchronization: rtnl_lock() semaphore. |
214 | Context: process | |
215 | ||
b3cf6545 | 216 | ndo_stop: |
1da177e4 LT |
217 | Synchronization: rtnl_lock() semaphore. |
218 | Context: process | |
93b6a3ad | 219 | Note: netif_running() is guaranteed false |
1da177e4 | 220 | |
b3cf6545 | 221 | ndo_do_ioctl: |
1da177e4 LT |
222 | Synchronization: rtnl_lock() semaphore. |
223 | Context: process | |
224 | ||
3d9d00bd AB |
225 | This is only called by network subsystems internally, |
226 | not by user space calling ioctl as it was before | |
227 | linux-5.14. | |
228 | ||
229 | ndo_siocbond: | |
230 | Synchronization: rtnl_lock() semaphore. | |
231 | Context: process | |
232 | ||
233 | Used by the bonding driver for the SIOCBOND family of | |
234 | ioctl commands. | |
235 | ||
ad7eab2a AB |
236 | ndo_siocwandev: |
237 | Synchronization: rtnl_lock() semaphore. | |
238 | Context: process | |
239 | ||
240 | Used by the drivers/net/wan framework to handle | |
241 | the SIOCWANDEV ioctl with the if_settings structure. | |
242 | ||
b9067f5d AB |
243 | ndo_siocdevprivate: |
244 | Synchronization: rtnl_lock() semaphore. | |
245 | Context: process | |
246 | ||
247 | This is used to implement SIOCDEVPRIVATE ioctl helpers. | |
248 | These should not be added to new drivers, so don't use. | |
249 | ||
a7605370 AB |
250 | ndo_eth_ioctl: |
251 | Synchronization: rtnl_lock() semaphore. | |
252 | Context: process | |
253 | ||
b3cf6545 | 254 | ndo_get_stats: |
9f9d41f0 JK |
255 | Synchronization: rtnl_lock() semaphore, dev_base_lock rwlock, or RCU. |
256 | Context: atomic (can't sleep under rwlock or RCU) | |
1da177e4 | 257 | |
b3cf6545 | 258 | ndo_start_xmit: |
04fd3d35 | 259 | Synchronization: __netif_tx_lock spinlock. |
17229333 | 260 | |
1da177e4 | 261 | When the driver sets NETIF_F_LLTX in dev->features this will be |
932ff279 | 262 | called without holding netif_tx_lock. In this case the driver |
f0cdf76c FW |
263 | has to lock by itself when needed. |
264 | The locking there should also properly protect against | |
265 | set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated. | |
19f59460 | 266 | Don't use it for new drivers. |
17229333 SH |
267 | |
268 | Context: Process with BHs disabled or BH (timer), | |
482a4360 | 269 | will be called with interrupts disabled by netconsole. |
17229333 | 270 | |
482a4360 MCC |
271 | Return codes: |
272 | ||
273 | * NETDEV_TX_OK everything ok. | |
274 | * NETDEV_TX_BUSY Cannot transmit packet, try later | |
1da177e4 LT |
275 | Usually a bug, means queue start/stop flow control is broken in |
276 | the driver. Note: the driver must NOT put the skb in its DMA ring. | |
1da177e4 | 277 | |
b3cf6545 | 278 | ndo_tx_timeout: |
04fd3d35 | 279 | Synchronization: netif_tx_lock spinlock; all TX queues frozen. |
1da177e4 LT |
280 | Context: BHs disabled |
281 | Notes: netif_queue_stopped() is guaranteed true | |
282 | ||
b3cf6545 | 283 | ndo_set_rx_mode: |
04fd3d35 | 284 | Synchronization: netif_addr_lock spinlock. |
1da177e4 LT |
285 | Context: BHs disabled |
286 | ||
bea3348e SH |
287 | struct napi_struct synchronization rules |
288 | ======================================== | |
289 | napi->poll: | |
482a4360 MCC |
290 | Synchronization: |
291 | NAPI_STATE_SCHED bit in napi->state. Device | |
b3cf6545 | 292 | driver's ndo_stop method will invoke napi_disable() on |
bea3348e SH |
293 | all NAPI instances which will do a sleeping poll on the |
294 | NAPI_STATE_SCHED napi->state bit, waiting for all pending | |
295 | NAPI activity to cease. | |
482a4360 MCC |
296 | |
297 | Context: | |
298 | softirq | |
299 | will be called with interrupts disabled by netconsole. |