/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for a previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
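
/* Illustrative worked example of the two formulas above, using
 * hypothetical numbers (they are not taken from this file): with
 * R = 125 Kbytes/sec (1 Mbit/s), B = 10 Kbytes and no peak rate
 * (P = infinity), a backlog of L = 60 Kbytes is reshaped with an
 * estimated latency of
 *
 *	lat = max((L-B)/R, (L-M)/P) = (60K - 10K)/125K = 0.4 sec
 *
 * and the timer-limited critical rate for that depth at HZ=100 is
 *
 *	R_crit = B*HZ = 10 Kbytes * 100 = 1 Mbyte/sec (8 Mbit/s),
 *
 * which is why the note above asks for a depth of roughly 10 Kbytes
 * or more to keep a 10 Mbit link busy.
 */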

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
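
/* Illustrative example with hypothetical numbers (not taken from this
 * file): for rate_bytes_ps = 125000 (1 Mbit/s), overhead = 0 and
 * time_in_ns = 8480000 (8.48 ms), len = 8480000 * 125000 / NSEC_PER_SEC
 * = 1060 bytes on an Ethernet-style link layer; on TC_LINKLAYER_ATM the
 * same interval yields 1060 / 53 * 48 = 960 bytes, since only 48 of
 * every 53 bytes in an ATM cell carry payload.
 */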

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
	return hdr_len + skb_gso_transport_seglen(skb);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
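
/* Note on the accounting above: the ancestors of this qdisc counted the
 * original GSO skb as a single packet, but it has now been replaced by
 * nb segments, so the tree-wide queue length is adjusted by (1 - nb),
 * i.e. increased by nb - 1, via qdisc_tree_decrease_qlen().
 */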

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
	}
	return len;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks),
					   true);

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC). */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}
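
/* Illustrative token accounting with hypothetical numbers (not taken
 * from this file): tokens are kept as nanoseconds of accumulated idle
 * time, capped at q->buffer. With a rate of 125000 bytes/sec (1 Mbit/s)
 * and a 10000-byte burst, q->buffer is 80000000 ns (80 ms), and a
 * 1500-byte packet costs psched_l2t_ns() = 12000000 ns (12 ms) of
 * tokens. If the remaining tokens after that charge are non-negative
 * the packet is released; otherwise the watchdog is armed for
 * now + (-toks) ns, the moment the bucket will have refilled enough.
 */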

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB]));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB]));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}
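
/* For context, an illustrative userspace configuration that exercises
 * this path (hypothetical device name and values, not from this file)
 * might be:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * tc translates rate/burst/latency into the struct tc_tbf_qopt and the
 * optional TCA_TBF_* attributes parsed above.
 */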

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_get_ns();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");