/*
 * net/sched/sch_generic.c      Generic packet scheduler routines.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

#define SCHED_TX_DROP -2
#define SCHED_TX_QUEUE -3

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->ingress_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
{
        spin_lock_bh(&dev->queue_lock);
        spin_lock(&dev->ingress_lock);
}

void qdisc_unlock_tree(struct net_device *dev)
{
        spin_unlock(&dev->ingress_lock);
        spin_unlock_bh(&dev->queue_lock);
}
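
/*
 * Illustrative sketch (not part of the original file): a caller that
 * replaces a device's root qdisc is expected to take both locks via the
 * helpers above, for example:
 *
 *      qdisc_lock_tree(dev);
 *      dev->qdisc_sleeping = new_qdisc;        (new_qdisc is hypothetical)
 *      qdisc_unlock_tree(dev);
 *
 * dev_init_scheduler() and dev_shutdown() later in this file follow this
 * pattern when they reset dev->qdisc and dev->qdisc_sleeping.
 */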

static inline int qdisc_qlen(struct Qdisc *q)
{
        BUG_ON((int) q->q.qlen < 0);
        return q->q.qlen;
}

static inline int handle_dev_cpu_collision(struct net_device *dev)
{
        if (unlikely(dev->xmit_lock_owner == smp_processor_id())) {
                if (net_ratelimit())
                        printk(KERN_WARNING
                               "Dead loop on netdevice %s, fix it urgently!\n",
                               dev->name);
                return SCHED_TX_DROP;
        }
        __get_cpu_var(netdev_rx_stat).cpu_collision++;
        return SCHED_TX_QUEUE;
}

static inline int
do_dev_requeue(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
{
        if (unlikely(skb->next))
                dev->gso_skb = skb;
        else
                q->ops->requeue(skb, q);
        /* XXX: Could netif_schedule fail? Or does the fact that we are
         * requeueing imply the hardware path is closed, so that even if
         * we fail, some interrupt will eventually wake us up?
         */
        netif_schedule(dev);
        return 0;
}

static inline struct sk_buff *
try_get_tx_pkt(struct net_device *dev, struct Qdisc *q)
{
        struct sk_buff *skb = dev->gso_skb;

        if (skb)
                dev->gso_skb = NULL;
        else
                skb = q->dequeue(q);

        return skb;
}

static inline int
tx_islocked(struct sk_buff *skb, struct net_device *dev, struct Qdisc *q)
{
        int ret = handle_dev_cpu_collision(dev);

        if (ret == SCHED_TX_DROP) {
                kfree_skb(skb);
                return qdisc_qlen(q);
        }

        return do_dev_requeue(skb, dev, q);
}

/*
   NOTE: Called under dev->queue_lock with locally disabled BH.

   __LINK_STATE_QDISC_RUNNING guarantees only one CPU
   can enter this region at a time.

   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to the device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive:
   if one is grabbed, the other must be free.

   Multiple CPUs may contend for the two locks.

   Note that this procedure can be called by a watchdog timer.

   Returns to the caller:
        0  - queue is empty or throttled.
        >0 - queue is not empty.
*/

static inline int qdisc_restart(struct net_device *dev)
{
        struct Qdisc *q = dev->qdisc;
        unsigned lockless = (dev->features & NETIF_F_LLTX);
        struct sk_buff *skb;
        int ret;

        skb = try_get_tx_pkt(dev, q);
        if (skb == NULL)
                return 0;

        /* we have a packet to send */
        if (!lockless) {
                if (!netif_tx_trylock(dev))
                        return tx_islocked(skb, dev, q);
        }
        /* all clear .. */
        spin_unlock(&dev->queue_lock);

        ret = NETDEV_TX_BUSY;
        if (!netif_queue_stopped(dev))
                /* churn baby churn .. */
                ret = dev_hard_start_xmit(skb, dev);

        if (!lockless)
                netif_tx_unlock(dev);

        spin_lock(&dev->queue_lock);

        /* we need to refresh q because it may be invalid since
         * we dropped dev->queue_lock earlier ...
         * So don't try to be clever, grasshopper.
         */
        q = dev->qdisc;
        /* most likely result, packet went ok */
        if (ret == NETDEV_TX_OK)
                return qdisc_qlen(q);
        /* only for lockless drivers .. */
        if (ret == NETDEV_TX_LOCKED && lockless)
                return tx_islocked(skb, dev, q);

        if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
                printk(KERN_WARNING "BUG %s code %d qlen %d\n",
                       dev->name, ret, q->q.qlen);

        return do_dev_requeue(skb, dev, q);
}

void __qdisc_run(struct net_device *dev)
{
        do {
                if (!qdisc_restart(dev))
                        break;
        } while (!netif_queue_stopped(dev));

        clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}
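
/*
 * For reference, a sketch (defined elsewhere, not in this file): the
 * qdisc_run() wrapper used by dev_queue_xmit() and the TX softirq path
 * lives in include/net/pkt_sched.h and looks roughly like the
 * following; it is what establishes the __LINK_STATE_QDISC_RUNNING
 * exclusion that qdisc_restart() relies on:
 *
 *      static inline void qdisc_run(struct net_device *dev)
 *      {
 *              if (!netif_queue_stopped(dev) &&
 *                  !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
 *                      __qdisc_run(dev);
 *      }
 */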

static void dev_watchdog(unsigned long arg)
{
        struct net_device *dev = (struct net_device *)arg;

        netif_tx_lock(dev);
        if (dev->qdisc != &noop_qdisc) {
                if (netif_device_present(dev) &&
                    netif_running(dev) &&
                    netif_carrier_ok(dev)) {
                        if (netif_queue_stopped(dev) &&
                            time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {
                                printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
                                       dev->name);
                                dev->tx_timeout(dev);
                        }
                        if (!mod_timer(&dev->watchdog_timer,
                                       round_jiffies(jiffies + dev->watchdog_timeo)))
                                dev_hold(dev);
                }
        }
        netif_tx_unlock(dev);

        dev_put(dev);
}

static void dev_watchdog_init(struct net_device *dev)
{
        init_timer(&dev->watchdog_timer);
        dev->watchdog_timer.data = (unsigned long)dev;
        dev->watchdog_timer.function = dev_watchdog;
}

void __netdev_watchdog_up(struct net_device *dev)
{
        if (dev->tx_timeout) {
                if (dev->watchdog_timeo <= 0)
                        dev->watchdog_timeo = 5*HZ;
                if (!mod_timer(&dev->watchdog_timer,
                               round_jiffies(jiffies + dev->watchdog_timeo)))
                        dev_hold(dev);
        }
}
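
/*
 * Illustrative sketch (an assumption, not from this file): a driver that
 * wants watchdog protection arms it by filling in two fields before
 * register_netdev(); my_tx_timeout() is a hypothetical handler:
 *
 *      dev->tx_timeout = my_tx_timeout;
 *      dev->watchdog_timeo = 5 * HZ;
 *
 * dev_watchdog() above then calls dev->tx_timeout() once the queue has
 * been stopped for longer than watchdog_timeo since dev->trans_start.
 */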

static void dev_watchdog_up(struct net_device *dev)
{
        __netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
        netif_tx_lock_bh(dev);
        if (del_timer(&dev->watchdog_timer))
                dev_put(dev);
        netif_tx_unlock_bh(dev);
}

void netif_carrier_on(struct net_device *dev)
{
        if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
        if (netif_running(dev))
                __netdev_watchdog_up(dev);
}

void netif_carrier_off(struct net_device *dev)
{
        if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
                linkwatch_fire_event(dev);
}

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        kfree_skb(skb);
        return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
        return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        if (net_ratelimit())
                printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
                       skb->dev->name);
        kfree_skb(skb);
        return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops = {
        .id             = "noop",
        .priv_size      = 0,
        .enqueue        = noop_enqueue,
        .dequeue        = noop_dequeue,
        .requeue        = noop_requeue,
        .owner          = THIS_MODULE,
};

struct Qdisc noop_qdisc = {
        .enqueue        = noop_enqueue,
        .dequeue        = noop_dequeue,
        .flags          = TCQ_F_BUILTIN,
        .ops            = &noop_qdisc_ops,
        .list           = LIST_HEAD_INIT(noop_qdisc.list),
};

static struct Qdisc_ops noqueue_qdisc_ops = {
        .id             = "noqueue",
        .priv_size      = 0,
        .enqueue        = noop_enqueue,
        .dequeue        = noop_dequeue,
        .requeue        = noop_requeue,
        .owner          = THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
        .enqueue        = NULL,
        .dequeue        = noop_dequeue,
        .flags          = TCQ_F_BUILTIN,
        .ops            = &noqueue_qdisc_ops,
        .list           = LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
        { 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */
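
/*
 * Worked example of the mapping above (TC_PRIO_* values as defined in
 * <linux/pkt_sched.h>; band 0 is dequeued first, band 2 last):
 *
 *      skb->priority = TC_PRIO_CONTROL     (7) -> prio2band[7] = 0
 *      skb->priority = TC_PRIO_INTERACTIVE (6) -> prio2band[6] = 0
 *      skb->priority = TC_PRIO_BESTEFFORT  (0) -> prio2band[0] = 1
 *      skb->priority = TC_PRIO_BULK        (2) -> prio2band[2] = 2
 *
 * prio2list() below masks the priority with TC_PRIO_MAX (15) before
 * indexing, so out-of-range priorities still land in a valid band.
 */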

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
                                             struct Qdisc *qdisc)
{
        struct sk_buff_head *list = qdisc_priv(qdisc);
        return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        struct sk_buff_head *list = prio2list(skb, qdisc);

        if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
                qdisc->q.qlen++;
                return __qdisc_enqueue_tail(skb, qdisc, list);
        }

        return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
                if (!skb_queue_empty(list + prio)) {
                        qdisc->q.qlen--;
                        return __qdisc_dequeue_head(qdisc, list + prio);
                }
        }

        return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc *qdisc)
{
        qdisc->q.qlen++;
        return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                __qdisc_reset_queue(qdisc, list + prio);

        qdisc->qstats.backlog = 0;
        qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
        struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

        memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
        RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
        return skb->len;

rtattr_failure:
        return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
        int prio;
        struct sk_buff_head *list = qdisc_priv(qdisc);

        for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
                skb_queue_head_init(list + prio);

        return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
        .id             = "pfifo_fast",
        .priv_size      = PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
        .enqueue        = pfifo_fast_enqueue,
        .dequeue        = pfifo_fast_dequeue,
        .requeue        = pfifo_fast_requeue,
        .init           = pfifo_fast_init,
        .reset          = pfifo_fast_reset,
        .dump           = pfifo_fast_dump,
        .owner          = THIS_MODULE,
};
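
/*
 * Usage note: pfifo_fast is the default root qdisc.  dev_activate()
 * below attaches it to any device with a non-zero tx_queue_len via
 *
 *      qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops, TC_H_ROOT);
 *
 * while devices with tx_queue_len == 0 (typically virtual interfaces)
 * get the built-in noqueue_qdisc instead.
 */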

struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
{
        void *p;
        struct Qdisc *sch;
        unsigned int size;
        int err = -ENOBUFS;

        /* ensure that the Qdisc and the private data are 32-byte aligned */
        size = QDISC_ALIGN(sizeof(*sch));
        size += ops->priv_size + (QDISC_ALIGNTO - 1);

        p = kzalloc(size, GFP_KERNEL);
        if (!p)
                goto errout;
        sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
        sch->padded = (char *) sch - (char *) p;

        INIT_LIST_HEAD(&sch->list);
        skb_queue_head_init(&sch->q);
        sch->ops = ops;
        sch->enqueue = ops->enqueue;
        sch->dequeue = ops->dequeue;
        sch->dev = dev;
        dev_hold(dev);
        atomic_set(&sch->refcnt, 1);

        return sch;
errout:
        return ERR_PTR(err);
}
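
/*
 * Note on the sizing arithmetic above (QDISC_ALIGNTO is 32 and
 * QDISC_ALIGN() rounds up to a multiple of it; both come from the
 * packet scheduler headers -- stated here as an assumption):
 *
 *      - kzalloc() returns an arbitrary pointer p;
 *      - sch is p rounded up to the next 32-byte boundary, skipping at
 *        most QDISC_ALIGNTO - 1 bytes, which the extra slack added to
 *        "size" accounts for;
 *      - sch->padded records that offset so __qdisc_destroy() below can
 *        kfree() the original pointer.
 */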
465
9f9afec4
PM
466struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
467 unsigned int parentid)
3d54b82f
TG
468{
469 struct Qdisc *sch;
10297b99 470
3d54b82f
TG
471 sch = qdisc_alloc(dev, ops);
472 if (IS_ERR(sch))
473 goto errout;
fd44de7c 474 sch->stats_lock = &dev->queue_lock;
9f9afec4 475 sch->parent = parentid;
3d54b82f 476
1da177e4
LT
477 if (!ops->init || ops->init(sch, NULL) == 0)
478 return sch;
479
0fbbeb1b 480 qdisc_destroy(sch);
3d54b82f 481errout:
1da177e4
LT
482 return NULL;
483}
484
485/* Under dev->queue_lock and BH! */
486
487void qdisc_reset(struct Qdisc *qdisc)
488{
489 struct Qdisc_ops *ops = qdisc->ops;
490
491 if (ops->reset)
492 ops->reset(qdisc);
493}
494
10297b99 495/* this is the rcu callback function to clean up a qdisc when there
1da177e4
LT
496 * are no further references to it */
497
498static void __qdisc_destroy(struct rcu_head *head)
499{
500 struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
1da177e4
LT
501 kfree((char *) qdisc - qdisc->padded);
502}
503
504/* Under dev->queue_lock and BH! */
505
506void qdisc_destroy(struct Qdisc *qdisc)
507{
85670cc1 508 struct Qdisc_ops *ops = qdisc->ops;
1da177e4
LT
509
510 if (qdisc->flags & TCQ_F_BUILTIN ||
85670cc1 511 !atomic_dec_and_test(&qdisc->refcnt))
1da177e4
LT
512 return;
513
85670cc1
PM
514 list_del(&qdisc->list);
515#ifdef CONFIG_NET_ESTIMATOR
516 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
517#endif
518 if (ops->reset)
519 ops->reset(qdisc);
520 if (ops->destroy)
521 ops->destroy(qdisc);
1da177e4 522
85670cc1
PM
523 module_put(ops->owner);
524 dev_put(qdisc->dev);
1da177e4
LT
525 call_rcu(&qdisc->q_rcu, __qdisc_destroy);
526}
527

void dev_activate(struct net_device *dev)
{
        /* No queueing discipline is attached to the device;
           create a default one, i.e. pfifo_fast for devices
           which need queueing and noqueue_qdisc for
           virtual interfaces.
         */

        if (dev->qdisc_sleeping == &noop_qdisc) {
                struct Qdisc *qdisc;
                if (dev->tx_queue_len) {
                        qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
                                                  TC_H_ROOT);
                        if (qdisc == NULL) {
                                printk(KERN_INFO "%s: activation failed\n",
                                       dev->name);
                                return;
                        }
                        list_add_tail(&qdisc->list, &dev->qdisc_list);
                } else {
                        qdisc = &noqueue_qdisc;
                }
                dev->qdisc_sleeping = qdisc;
        }

        if (!netif_carrier_ok(dev))
                /* Delay activation until next carrier-on event */
                return;

        spin_lock_bh(&dev->queue_lock);
        rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
        if (dev->qdisc != &noqueue_qdisc) {
                dev->trans_start = jiffies;
                dev_watchdog_up(dev);
        }
        spin_unlock_bh(&dev->queue_lock);
}
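
/*
 * Context note (an assumption about the callers, not stated in this
 * file): dev_activate() is reached from the dev_open() path and
 * dev_deactivate() from dev_close() in net/core/dev.c, so a device only
 * has a working qdisc attached while it is administratively up.
 */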

void dev_deactivate(struct net_device *dev)
{
        struct Qdisc *qdisc;
        struct sk_buff *skb;

        spin_lock_bh(&dev->queue_lock);
        qdisc = dev->qdisc;
        dev->qdisc = &noop_qdisc;

        qdisc_reset(qdisc);

        skb = dev->gso_skb;
        dev->gso_skb = NULL;
        spin_unlock_bh(&dev->queue_lock);

        kfree_skb(skb);

        dev_watchdog_down(dev);

        /* Wait for outstanding dev_queue_xmit calls. */
        synchronize_rcu();

        /* Wait for outstanding qdisc_run calls. */
        while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
                yield();
}

void dev_init_scheduler(struct net_device *dev)
{
        qdisc_lock_tree(dev);
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        INIT_LIST_HEAD(&dev->qdisc_list);
        qdisc_unlock_tree(dev);

        dev_watchdog_init(dev);
}

void dev_shutdown(struct net_device *dev)
{
        struct Qdisc *qdisc;

        qdisc_lock_tree(dev);
        qdisc = dev->qdisc_sleeping;
        dev->qdisc = &noop_qdisc;
        dev->qdisc_sleeping = &noop_qdisc;
        qdisc_destroy(qdisc);
#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
        if ((qdisc = dev->qdisc_ingress) != NULL) {
                dev->qdisc_ingress = NULL;
                qdisc_destroy(qdisc);
        }
#endif
        BUG_TRAP(!timer_pending(&dev->watchdog_timer));
        qdisc_unlock_tree(dev);
}

EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);