[NET_SCHED]: Reread dev->qdisc for NETDEV_TX_OK
[linux-2.6-block.git] net/sched/sch_generic.c
/*
 * net/sched/sch_generic.c     Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);
        spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock(&dev->ingress_lock);
        spin_unlock_bh(&dev->queue_lock);
}

/*
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive,
   if one is grabbed, another must be free.
 */
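
/* qdisc_restart() below illustrates the resulting dance: it is entered with
 * dev->queue_lock held, drops it while the driver is called under
 * netif_tx_lock, and reacquires it afterwards.  Because dev->qdisc may have
 * been replaced while queue_lock was released (dev_deactivate() swaps in
 * &noop_qdisc), the qdisc pointer is reread after every reacquisition of
 * queue_lock, including after a successful NETDEV_TX_OK transmit, which is
 * the case named in the subject line above.
 */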


/* Kick device.

   Returns:  0  - queue is empty or throttled.
            >0  - queue is not empty.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        struct sk_buff *skb;

        /* Dequeue packet */
        if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
                unsigned nolock = (dev->features & NETIF_F_LLTX);

                dev->gso_skb = NULL;

                /*
                 * When the driver has LLTX set it does its own locking
                 * in start_xmit. No need to add additional overhead by
                 * locking again. These checks are worth it because
                 * even uncongested locks can be quite expensive.
                 * The driver can do trylock like here too, in case
                 * of lock congestion it should return -1 and the packet
                 * will be requeued.
                 */
                if (!nolock) {
                        if (!netif_tx_trylock(dev)) {
                        collision:
                                /* So, someone grabbed the driver. */

                                /* It may be a transient configuration error,
                                   when hard_start_xmit() recurses. We detect
                                   it by checking the xmit owner and drop the
                                   packet when a deadloop is detected.
                                 */
                                if (dev->xmit_lock_owner == smp_processor_id()) {
                                        kfree_skb(skb);
                                        if (net_ratelimit())
                                                printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
                                        goto out;
                                }
                                __get_cpu_var(netdev_rx_stat).cpu_collision++;
                                goto requeue;
                        }
                }

                {
                        /* And release queue */
                        spin_unlock(&dev->queue_lock);

                        if (!netif_queue_stopped(dev)) {
                                int ret;

                                ret = dev_hard_start_xmit(skb, dev);
                                if (ret == NETDEV_TX_OK) {
                                        if (!nolock) {
                                                netif_tx_unlock(dev);
                                        }
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto out;
                                }
                                if (ret == NETDEV_TX_LOCKED && nolock) {
                                        spin_lock(&dev->queue_lock);
                                        q = dev->qdisc;
                                        goto collision;
                                }
                        }

                        /* NETDEV_TX_BUSY - we need to requeue */
                        /* Release the driver */
                        if (!nolock) {
                                netif_tx_unlock(dev);
                        }
                        spin_lock(&dev->queue_lock);
                        q = dev->qdisc;
                }

                /* Device kicked us out :(
                   This is possible in the following cases:

                   0. driver is locked
                   1. fastroute is enabled
                   2. device cannot determine busy state
                      before start of transmission (f.e. dialout)
                   3. device is buggy (ppp)
                 */

requeue:
                if (skb->next)
                        dev->gso_skb = skb;
                else
                        q->ops->requeue(skb, q);
                netif_schedule(dev);
                return 0;
        }

out:
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

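/* __qdisc_run() drains the queue by calling qdisc_restart() until it reports
 * an empty (or throttled) queue or the driver has stopped the queue.  The
 * caller is expected to have set __LINK_STATE_QDISC_RUNNING beforehand (the
 * qdisc_run() wrapper in net/pkt_sched.h is expected to do this with
 * test_and_set_bit()); the bit is cleared here so that dev_deactivate() can
 * wait for a concurrent run to finish.
 */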
void __qdisc_run(struct net_device *dev)
{
        if (unlikely(dev->qdisc == &noop_qdisc))
                goto out;

        do {
                if (!qdisc_restart(dev))
                        break;
        } while (!netif_queue_stopped(dev));

out:
        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

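/* The watchdog timer holds a reference on the device while it is pending:
 * whenever mod_timer() reports that the timer was not already armed, a
 * dev_hold() is taken; dev_watchdog() drops it with dev_put() once it has
 * run (re-holding if it re-arms itself), and dev_watchdog_down() drops it
 * when del_timer() removes a pending timer.
 */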
void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
                        dev_hold(dev);
        }
}

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */
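
/* noop_qdisc is what dev->qdisc points at while a device is down or being
 * deactivated (see dev_init_scheduler() and dev_deactivate() below), so any
 * packet submitted in that window is simply freed and reported as
 * NET_XMIT_CN.
 */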

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id        = "noop",
        .priv_size = 0,
        .enqueue   = noop_enqueue,
        .dequeue   = noop_dequeue,
        .requeue   = noop_requeue,
        .owner     = THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue = noop_enqueue,
        .dequeue = noop_dequeue,
        .flags   = TCQ_F_BUILTIN,
        .ops     = &noop_qdisc_ops,
        .list    = LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id        = "noqueue",
        .priv_size = 0,
        .enqueue   = noop_enqueue,
        .dequeue   = noop_dequeue,
        .requeue   = noop_requeue,
        .owner     = THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue = NULL,
        .dequeue = noop_dequeue,
        .flags   = TCQ_F_BUILTIN,
        .ops     = &noqueue_qdisc_ops,
        .list    = LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */
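
/* Band 0 is drained first (see pfifo_fast_dequeue() below), so it carries the
 * highest-priority traffic; prio2band[] above maps skb->priority & TC_PRIO_MAX
 * to one of the three bands, and each band may hold at most
 * dev->tx_queue_len packets before further packets are dropped.
 */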

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id        = "pfifo_fast",
        .priv_size = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue   = pfifo_fast_enqueue,
        .dequeue   = pfifo_fast_dequeue,
        .requeue   = pfifo_fast_requeue,
        .init      = pfifo_fast_init,
        .reset     = pfifo_fast_reset,
        .dump      = pfifo_fast_dump,
        .owner     = THIS_MODULE,
};
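
/* pfifo_fast_ops is the default root qdisc: dev_activate() below attaches it
 * to any device with a non-zero tx_queue_len that has no queueing discipline
 * configured yet.
 */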

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
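        /* kzalloc() may return a pointer that is not QDISC_ALIGNTO-aligned,
         * so over-allocate by QDISC_ALIGNTO - 1 bytes, place the Qdisc at the
         * first aligned address inside the buffer, and remember the offset in
         * sch->padded so __qdisc_destroy() can kfree() the original pointer.
         */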
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(-err);
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
                                 unsigned int parentid)
{
        struct Qdisc *sch;

        sch = qdisc_alloc(dev, ops);
        if (IS_ERR(sch))
                goto errout;
        sch->stats_lock = &dev->queue_lock;
        sch->parent = parentid;

        if (!ops->init || ops->init(sch, NULL) == 0)
                return sch;

        qdisc_destroy(sch);
errout:
        return NULL;
}
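
/* Unlike qdisc_alloc(), qdisc_create_dflt() returns NULL rather than an
 * ERR_PTR() on failure; callers such as dev_activate() below only check for
 * NULL.  The parentid argument becomes sch->parent (TC_H_ROOT for a root
 * qdisc).
 */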

/* Under dev->queue_lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (ops->reset)
                ops->reset(qdisc);
}

/* This is the RCU callback that cleans up a qdisc once there
 * are no further references to it. */

static void __qdisc_destroy(struct rcu_head *head)
{
        struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
        kfree((char *) qdisc - qdisc->padded);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
        struct Qdisc_ops *ops = qdisc->ops;

        if (qdisc->flags & TCQ_F_BUILTIN ||
            !atomic_dec_and_test(&qdisc->refcnt))
                return;

        list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
        gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
        if (ops->reset)
                ops->reset(qdisc);
        if (ops->destroy)
                ops->destroy(qdisc);

        module_put(ops->owner);
        dev_put(qdisc->dev);
        call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
        /* If no queueing discipline is attached to the device,
           create a default one: pfifo_fast for devices that
           need queueing and noqueue_qdisc for virtual
           interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n", dev->name);
                                return;
                        }
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                dev->qdisc_sleeping = qdisc;
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}
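
/* dev_deactivate() quiesces the transmit path: the active qdisc is replaced
 * by &noop_qdisc under dev->queue_lock and reset, synchronize_rcu() waits for
 * dev_queue_xmit() callers that may still see the old pointer, and the loop
 * below waits for __LINK_STATE_QDISC_RUNNING to clear so that no qdisc_run()
 * is still using it.  Any deferred GSO segment list is dropped as well.
 */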

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        spin_unlock_bh(&dev->queue_lock);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();

        if (dev->gso_skb) {
                kfree_skb(dev->gso_skb);
                dev->gso_skb = NULL;
        }
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);