/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/* Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in the queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, the other one,
	with rate P (peak rate) and depth M (equal to the link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R and B>M. If P is infinite, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not woken up by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much stricter: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use an Alpha with HZ=1000 :-)

	With classful TBF, limit is just kept for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed the limit is not effective anymore.
*/
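
/* A worked example of the latency estimate above (illustrative numbers):
 * with rate R = 1 Mbyte/s, depth B = 10 Kbytes, peak rate P = 2 Mbyte/s,
 * M = 1500 bytes and a backlog of L = 100 Kbytes,
 *
 *	lat = max((100000-10000)/1000000, (100000-1500)/2000000)
 *	    = max(90ms, ~49ms) = 90ms,
 *
 * i.e. the steady-state bucket, not the peak-rate bucket, dominates the
 * worst-case delay.
 */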

struct tbf_sched_data {
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		max_size;
	s64		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	s64		mtu;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;

/* Variables */
	s64	tokens;			/* Current number of B tokens */
	s64	ptokens;		/* Current number of P tokens */
	s64	t_c;			/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

/* Time to Length, convert time in ns to length in bytes
 * to determine how many bytes can be sent in the given time.
 */
static u64 psched_ns_t2l(const struct psched_ratecfg *r,
			 u64 time_in_ns)
{
	/* The formula is :
	 * len = (time_in_ns * r->rate_bytes_ps) / NSEC_PER_SEC
	 */
	u64 len = time_in_ns * r->rate_bytes_ps;

	do_div(len, NSEC_PER_SEC);

	if (unlikely(r->linklayer == TC_LINKLAYER_ATM)) {
		/* 48 payload bytes per 53-byte ATM cell */
		do_div(len, 53);
		len = len * 48;
	}

	if (len > r->overhead)
		len -= r->overhead;
	else
		len = 0;

	return len;
}

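/* For example (illustrative numbers): at r->rate_bytes_ps = 125000000
 * (1 Gbit/s) and zero overhead, time_in_ns = 1000000 (1 ms) gives
 * len = 125000 bytes, i.e. roughly 83 full-size 1500-byte frames.
 */
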
/* GSO packet is too big, segment it so that tbf can transmit
 * each segment in time
 */
static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *segs, *nskb;
	netdev_features_t features = netif_skb_features(skb);
	unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
	int ret, nb;

	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);

	if (IS_ERR_OR_NULL(segs))
		return qdisc_drop(skb, sch, to_free);

	nb = 0;
	while (segs) {
		nskb = segs->next;
		segs->next = NULL;
		qdisc_skb_cb(segs)->pkt_len = segs->len;
		len += segs->len;
		ret = qdisc_enqueue(segs, q->qdisc, to_free);
		if (ret != NET_XMIT_SUCCESS) {
			if (net_xmit_drop_count(ret))
				qdisc_qstats_drop(sch);
		} else {
			nb++;
		}
		segs = nskb;
	}
	sch->q.qlen += nb;
	if (nb > 1)
		qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
	consume_skb(skb);
	return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}

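/* For example (illustrative numbers): a 45000-byte TSO super-packet at
 * max_size 1500 is split here into roughly 30 MTU-sized segments, so
 * each one can be paced against the bucket individually instead of the
 * whole super-packet being dropped.
 */
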
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (qdisc_pkt_len(skb) > q->max_size) {
		/* An oversized packet is dropped unless it is GSO and
		 * each of its segments would fit within max_size.
		 */
		if (skb_is_gso(skb) &&
		    skb_gso_validate_mac_len(skb, q->max_size))
			return tbf_segment(skb, sch, to_free);
		return qdisc_drop(skb, sch, to_free);
	}
	ret = qdisc_enqueue(skb, q->qdisc, to_free);
	if (ret != NET_XMIT_SUCCESS) {
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return ret;
	}

	qdisc_qstats_backlog_inc(sch, skb);
	sch->q.qlen++;
	return NET_XMIT_SUCCESS;
}

static bool tbf_peak_present(const struct tbf_sched_data *q)
{
	return q->peak.rate_bytes_ps;
}

static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->ops->peek(q->qdisc);

	if (skb) {
		s64 now;
		s64 toks;
		s64 ptoks = 0;
		unsigned int len = qdisc_pkt_len(skb);

		now = ktime_get_ns();
		toks = min_t(s64, now - q->t_c, q->buffer);

		if (tbf_peak_present(q)) {
			ptoks = toks + q->ptokens;
			if (ptoks > q->mtu)
				ptoks = q->mtu;
			ptoks -= (s64) psched_l2t_ns(&q->peak, len);
		}
		toks += q->tokens;
		if (toks > q->buffer)
			toks = q->buffer;
		toks -= (s64) psched_l2t_ns(&q->rate, len);

		if ((toks|ptoks) >= 0) {
			skb = qdisc_dequeue_peeked(q->qdisc);
			if (unlikely(!skb))
				return NULL;

			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			qdisc_qstats_backlog_dec(sch, skb);
			sch->q.qlen--;
			qdisc_bstats_update(sch, skb);
			return skb;
		}

		qdisc_watchdog_schedule_ns(&q->watchdog,
					   now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which can be sent now. That sounds appealing, but it
		   is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		qdisc_qstats_overlimit(sch);
	}
	return NULL;
}

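/* A worked example of the token arithmetic above (illustrative numbers):
 * at rate 125000000 bytes/s (1 Gbit/s), a 1500-byte packet costs
 * psched_l2t_ns() = 12000 ns of tokens.  If only 4000 ns of tokens are
 * available, toks ends up at -8000, so the watchdog is armed to fire
 * 8 us later, when exactly enough tokens will have accumulated.
 */
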
static void tbf_reset(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->qstats.backlog = 0;
	sch->q.qlen = 0;
	q->t_c = ktime_get_ns();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
	[TCA_TBF_PARMS]	= { .len = sizeof(struct tc_tbf_qopt) },
	[TCA_TBF_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_TBF_RATE64]	= { .type = NLA_U64 },
	[TCA_TBF_PRATE64]	= { .type = NLA_U64 },
	[TCA_TBF_BURST] = { .type = NLA_U32 },
	[TCA_TBF_PBURST] = { .type = NLA_U32 },
};

static int tbf_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	int err;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_TBF_MAX + 1];
	struct tc_tbf_qopt *qopt;
	struct Qdisc *child = NULL;
	struct psched_ratecfg rate;
	struct psched_ratecfg peak;
	u64 max_size;
	s64 buffer, mtu;
	u64 rate64 = 0, prate64 = 0;

	err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL);
	if (err < 0)
		return err;

	err = -EINVAL;
	if (tb[TCA_TBF_PARMS] == NULL)
		goto done;

	qopt = nla_data(tb[TCA_TBF_PARMS]);
	if (qopt->rate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->rate,
					      tb[TCA_TBF_RTAB],
					      NULL));

	if (qopt->peakrate.linklayer == TC_LINKLAYER_UNAWARE)
		qdisc_put_rtab(qdisc_get_rtab(&qopt->peakrate,
					      tb[TCA_TBF_PTAB],
					      NULL));

	buffer = min_t(u64, PSCHED_TICKS2NS(qopt->buffer), ~0U);
	mtu = min_t(u64, PSCHED_TICKS2NS(qopt->mtu), ~0U);

	if (tb[TCA_TBF_RATE64])
		rate64 = nla_get_u64(tb[TCA_TBF_RATE64]);
	psched_ratecfg_precompute(&rate, &qopt->rate, rate64);

	if (tb[TCA_TBF_BURST]) {
		max_size = nla_get_u32(tb[TCA_TBF_BURST]);
		buffer = psched_l2t_ns(&rate, max_size);
	} else {
		max_size = min_t(u64, psched_ns_t2l(&rate, buffer), ~0U);
	}

	if (qopt->peakrate.rate) {
		if (tb[TCA_TBF_PRATE64])
			prate64 = nla_get_u64(tb[TCA_TBF_PRATE64]);
		psched_ratecfg_precompute(&peak, &qopt->peakrate, prate64);
		if (peak.rate_bytes_ps <= rate.rate_bytes_ps) {
			pr_warn_ratelimited("sch_tbf: peakrate %llu is lower than or equal to rate %llu!\n",
					    peak.rate_bytes_ps, rate.rate_bytes_ps);
			err = -EINVAL;
			goto done;
		}

		if (tb[TCA_TBF_PBURST]) {
			u32 pburst = nla_get_u32(tb[TCA_TBF_PBURST]);
			max_size = min_t(u32, max_size, pburst);
			mtu = psched_l2t_ns(&peak, pburst);
		} else {
			max_size = min_t(u64, max_size, psched_ns_t2l(&peak, mtu));
		}
	} else {
		memset(&peak, 0, sizeof(peak));
	}

	if (max_size < psched_mtu(qdisc_dev(sch)))
		pr_warn_ratelimited("sch_tbf: burst %llu is lower than device %s mtu (%u)!\n",
				    max_size, qdisc_dev(sch)->name,
				    psched_mtu(qdisc_dev(sch)));

	if (!max_size) {
		err = -EINVAL;
		goto done;
	}

	if (q->qdisc != &noop_qdisc) {
		err = fifo_set_limit(q->qdisc, qopt->limit);
		if (err)
			goto done;
	} else if (qopt->limit > 0) {
		child = fifo_create_dflt(sch, &bfifo_qdisc_ops, qopt->limit,
					 extack);
		if (IS_ERR(child)) {
			err = PTR_ERR(child);
			goto done;
		}

		/* child is fifo, no need to check for noop_qdisc */
		qdisc_hash_add(child, true);
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
					  q->qdisc->qstats.backlog);
		qdisc_destroy(q->qdisc);
		q->qdisc = child;
	}
	q->limit = qopt->limit;
	if (tb[TCA_TBF_PBURST])
		q->mtu = mtu;
	else
		q->mtu = PSCHED_TICKS2NS(qopt->mtu);
	q->max_size = max_size;
	if (tb[TCA_TBF_BURST])
		q->buffer = buffer;
	else
		q->buffer = PSCHED_TICKS2NS(qopt->buffer);
	q->tokens = q->buffer;
	q->ptokens = q->mtu;

	memcpy(&q->rate, &rate, sizeof(struct psched_ratecfg));
	memcpy(&q->peak, &peak, sizeof(struct psched_ratecfg));

	sch_tree_unlock(sch);
	err = 0;
done:
	return err;
}

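/* For example (illustrative numbers): with rate 1000000 bytes/s and no
 * TCA_TBF_BURST attribute, a buffer of 10000000 ns (10 ms worth of
 * tokens) makes psched_ns_t2l() report max_size = 10000 bytes above.
 */
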
static int tbf_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	if (!opt)
		return -EINVAL;

	q->t_c = ktime_get_ns();

	return tbf_change(sch, opt, extack);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
	qdisc_destroy(q->qdisc);
}

static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;
	struct tc_tbf_qopt opt;

	sch->qstats.backlog = q->qdisc->qstats.backlog;
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;

	opt.limit = q->limit;
	psched_ratecfg_getrate(&opt.rate, &q->rate);
	if (tbf_peak_present(q))
		psched_ratecfg_getrate(&opt.peakrate, &q->peak);
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = PSCHED_NS2TICKS(q->mtu);
	opt.buffer = PSCHED_NS2TICKS(q->buffer);
	if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;
	if (q->rate.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;
	if (tbf_peak_present(q) &&
	    q->peak.rate_bytes_ps >= (1ULL << 32) &&
	    nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps,
			      TCA_TBF_PAD))
		goto nla_put_failure;

	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	*old = qdisc_replace(sch, new, &q->qdisc);
	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_find(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static const struct Qdisc_class_ops tbf_class_ops = {
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.find		=	tbf_find,
	.walk		=	tbf_walk,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");
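
/* Example configuration (a sketch; see tc-tbf(8) for the full syntax):
 *
 *	tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms
 *
 * installs a TBF with rate R = 1 Mbit/s and bucket depth B = 10 Kbytes,
 * with the inner bfifo limit sized so that the worst-case queueing
 * delay stays around 70 ms.
 */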