/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for a previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak-rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use Alpha with HZ=1000 :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
*/
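
/* Illustrative worked example of the two formulas above, using
 * hypothetical numbers (they are not taken from this file): with
 * R = 125 Kbytes/sec (1 Mbit/s), B = 10 Kbytes and no peak rate
 * (P = infinity), a backlog of L = 60 Kbytes is reshaped with an
 * estimated latency of
 *
 *	lat = max((L-B)/R, (L-M)/P) = (60K - 10K)/125K = 0.4 sec
 *
 * and the timer-limited critical rate for that depth at HZ=100 is
 *
 *	R_crit = B*HZ = 10 Kbytes * 100 = 1 Mbyte/sec (8 Mbit/s),
 *
 * which is why the note above asks for a depth of roughly 10 Kbytes
 * or more to keep a 10 Mbit link busy.
 */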

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is:
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}
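
/* Illustrative example with hypothetical numbers (not taken from this
 * file): for rate_bytes_ps = 125000 (1 Mbit/s), overhead = 0 and
 * time_in_ns = 8480000 (8.48 ms), len = 8480000 * 125000 / NSEC_PER_SEC
 * = 1060 bytes on an Ethernet-style link layer; on TC_LINKLAYER_ATM the
 * same interval yields 1060 / 53 * 48 = 960 bytes, since only 48 of
 * every 53 bytes in an ATM cell carry payload.
 */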

/*
 * Return length of individual segments of a gso packet,
 * including all headers (MAC, IP, TCP/UDP)
 */
static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
{
	unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
	return hdr_len + skb_gso_transport_seglen(skb);
}

/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_reshape_fail(skb, sch);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		ret = qdisc_enqueue(segs, q->qdisc);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_decrease_qlen(sch, 1 - nb);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
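
/* Note on the accounting above: the ancestors of this qdisc counted the
 * original GSO skb as a single packet, but it has now been replaced by
 * nb segments, so the tree-wide queue length is adjusted by (1 - nb),
 * i.e. increased by nb - 1, via qdisc_tree_decrease_qlen().
 */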

static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		if (skb_is_gso(skb) && skb_gso_mac_seglen(skb) <= q->max_size)
			return tbf_segment(skb, sch);
		return qdisc_reshape_fail(skb, sch);
	}
	ret = qdisc_enqueue(skb, q->qdisc);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static unsigned int tbf_drop(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		qdisc_qstats_drop(sch);
	}
	return len;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			qdisc_unthrottled(sch);
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks),
					   true);

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. It sounds cool,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC). */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}
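
/* Illustrative token accounting with hypothetical numbers (not taken
 * from this file): tokens are kept as nanoseconds of accumulated idle
 * time, capped at q->buffer. With a rate of 125000 bytes/sec (1 Mbit/s)
 * and a 10000-byte burst, q->buffer is 80000000 ns (80 ms), and a
 * 1500-byte packet costs psched_l2t_ns() = 12000000 ns (12 ms) of
 * tokens. If the remaining tokens after that charge are non-negative
 * the packet is released; otherwise the watchdog is armed for
 * now + (-toks) ns, the moment the bucket will have refilled enough.
 */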

static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]		= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST]		= { .type = NLA_U32 },
	[TCA_TBF_PBURST]	= { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB]));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB]));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu !\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u) !\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}
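
/* For context, an illustrative userspace configuration that exercises
 * this path (hypothetical device name and values, not from this file)
 * might be:
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * tc translates rate/burst/latency into the struct tc_tbf_qopt and the
 * optional TCA_TBF_* attributes parsed above.
 */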

static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = ktime_get_ns();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = q->qdisc;
	q->qdisc = new;
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");