Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
[linux-2.6-block.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 /*
36
37    Short review.
38    -------------
39
40    This file consists of two interrelated parts:
41
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44
45    Generally, queueing discipline ("qdisc") is a black box,
46    which is able to enqueue packets and to dequeue them (when
47    device is ready to send something) in order and at times
48    determined by algorithm hidden in it.
49
   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.
57
58    The goal of the routines in this file is to translate
59    information supplied by user in the form of handles
60    to more intelligible for kernel form, to make some sanity
61    checks and part of work, which is common to all qdiscs
62    and to provide rtnetlink notifications.
63
64    All real intelligent work is done inside qdisc modules.
65
66
67
68    Every discipline has two major routines: enqueue and dequeue.
69
70    ---dequeue
71
72    dequeue usually returns a skb to send. It is allowed to return NULL,
73    but it does not mean that queue is empty, it just means that
74    discipline does not want to send anything this time.
75    Queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue, but q->q.qlen must still be valid.
78
79    ---enqueue
80
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
84    NET_XMIT_DROP        - this packet dropped
85      Expected action: do not backoff, but wait until queue will clear.
86    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
87      Expected action: backoff or ignore
88
89    Auxiliary routines:
90
91    ---peek
92
93    like dequeue but without removing a packet from the queue
94
95    ---reset
96
97    returns qdisc to initial state: purge all buffers, clear all
98    timers, counters (except for statistics) etc.
99
100    ---init
101
102    initializes newly created qdisc.
103
104    ---destroy
105
106    destroys resources allocated by init and during lifetime of qdisc.
107
108    ---change
109
110    changes qdisc parameters.
111  */
112
113 /* Protects list of registered TC modules. It is pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117 /************************************************
118  *      Queueing disciplines manipulation.      *
119  ************************************************/
120
121
122 /* The list of all installed queueing disciplines. */
123
124 static struct Qdisc_ops *qdisc_base;
125
126 /* Register/unregister queueing discipline */
127
128 int register_qdisc(struct Qdisc_ops *qops)
129 {
130         struct Qdisc_ops *q, **qp;
131         int rc = -EEXIST;
132
133         write_lock(&qdisc_mod_lock);
134         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135                 if (!strcmp(qops->id, q->id))
136                         goto out;
137
138         if (qops->enqueue == NULL)
139                 qops->enqueue = noop_qdisc_ops.enqueue;
140         if (qops->peek == NULL) {
141                 if (qops->dequeue == NULL)
142                         qops->peek = noop_qdisc_ops.peek;
143                 else
144                         goto out_einval;
145         }
146         if (qops->dequeue == NULL)
147                 qops->dequeue = noop_qdisc_ops.dequeue;
148
149         if (qops->cl_ops) {
150                 const struct Qdisc_class_ops *cops = qops->cl_ops;
151
152                 if (!(cops->find && cops->walk && cops->leaf))
153                         goto out_einval;
154
155                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
156                         goto out_einval;
157         }
158
159         qops->next = NULL;
160         *qp = qops;
161         rc = 0;
162 out:
163         write_unlock(&qdisc_mod_lock);
164         return rc;
165
166 out_einval:
167         rc = -EINVAL;
168         goto out;
169 }
170 EXPORT_SYMBOL(register_qdisc);
171
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
190
191 /* Get default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
193 {
194         read_lock(&qdisc_mod_lock);
195         strlcpy(name, default_qdisc_ops->id, len);
196         read_unlock(&qdisc_mod_lock);
197 }
198
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201         struct Qdisc_ops *q = NULL;
202
203         for (q = qdisc_base; q; q = q->next) {
204                 if (!strcmp(name, q->id)) {
205                         if (!try_module_get(q->owner))
206                                 q = NULL;
207                         break;
208                 }
209         }
210
211         return q;
212 }
213
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
216 {
217         const struct Qdisc_ops *ops;
218
219         if (!capable(CAP_NET_ADMIN))
220                 return -EPERM;
221
222         write_lock(&qdisc_mod_lock);
223         ops = qdisc_lookup_default(name);
224         if (!ops) {
225                 /* Not found, drop lock and try to load module */
226                 write_unlock(&qdisc_mod_lock);
227                 request_module("sch_%s", name);
228                 write_lock(&qdisc_mod_lock);
229
230                 ops = qdisc_lookup_default(name);
231         }
232
233         if (ops) {
234                 /* Set new default */
235                 module_put(default_qdisc_ops->owner);
236                 default_qdisc_ops = ops;
237         }
238         write_unlock(&qdisc_mod_lock);
239
240         return ops ? 0 : -ENOENT;
241 }
242
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set the default qdisc from the kernel config. Runs as a
 * late_initcall so built-in schedulers have already registered.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
251
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* Roots without a device (e.g. builtin noop) have no hash to walk;
	 * only the root itself can match.
	 */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Non-root qdiscs are reachable via the per-device handle hash. */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
274
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
276 {
277         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
278                 ASSERT_RTNL();
279                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
280                 if (invisible)
281                         q->flags |= TCQ_F_INVISIBLE;
282         }
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289                 ASSERT_RTNL();
290                 hash_del_rcu(&q->hash);
291         }
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         if (!handle)
300                 return NULL;
301         q = qdisc_match_from_root(dev->qdisc, handle);
302         if (q)
303                 goto out;
304
305         if (dev_ingress_queue(dev))
306                 q = qdisc_match_from_root(
307                         dev_ingress_queue(dev)->qdisc_sleeping,
308                         handle);
309 out:
310         return q;
311 }
312
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
314 {
315         struct netdev_queue *nq;
316         struct Qdisc *q;
317
318         if (!handle)
319                 return NULL;
320         q = qdisc_match_from_root(dev->qdisc, handle);
321         if (q)
322                 goto out;
323
324         nq = dev_ingress_queue_rcu(dev);
325         if (nq)
326                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
327 out:
328         return q;
329 }
330
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333         unsigned long cl;
334         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336         if (cops == NULL)
337                 return NULL;
338         cl = cops->find(p, classid);
339
340         if (cl == 0)
341                 return NULL;
342         return cops->leaf(p, cl);
343 }
344
345 /* Find queueing discipline by name */
346
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349         struct Qdisc_ops *q = NULL;
350
351         if (kind) {
352                 read_lock(&qdisc_mod_lock);
353                 for (q = qdisc_base; q; q = q->next) {
354                         if (nla_strcmp(kind, q->id) == 0) {
355                                 if (!try_module_get(q->owner))
356                                         q = NULL;
357                                 break;
358                         }
359                 }
360                 read_unlock(&qdisc_mod_lock);
361         }
362         return q;
363 }
364
/* The linklayer setting was not transferred from iproute2 in older
 * versions, and the rate table lookup system has been dropped from
 * the kernel. To stay backward compatible with older iproute2 tc
 * utils, we detect the linklayer setting by checking whether the rate
 * table was modified.
 *
 * For linklayer ATM table entries, the rate table will be aligned to
 * 48 bytes, thus some table entries will contain the same value.  The
 * mpu (min packet unit) is also encoded into the old rate table, thus
 * starting from the mpu, we find low and high table entries for
 * mapping this cell.  If these entries contain the same value, then
 * the rate table has been modified for linklayer ATM.
 *
 * This is done by rounding mpu to the nearest 48-byte cell/entry,
 * then rounding up to the next cell, calculating the table entry one
 * below, and comparing.
 */
/* Heuristically classify a legacy rate table as ATM or Ethernet:
 * ATM's 48-byte cell alignment makes the table entries bracketing the
 * mpu collapse to the same value (see the block comment above).
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	/* Equal bracketing entries within the table imply 48-byte cells. */
	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
403
404 static struct qdisc_rate_table *qdisc_rtab_list;
405
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
407                                         struct nlattr *tab,
408                                         struct netlink_ext_ack *extack)
409 {
410         struct qdisc_rate_table *rtab;
411
412         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
413             nla_len(tab) != TC_RTAB_SIZE) {
414                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
415                 return NULL;
416         }
417
418         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
419                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
420                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
421                         rtab->refcnt++;
422                         return rtab;
423                 }
424         }
425
426         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
427         if (rtab) {
428                 rtab->rate = *r;
429                 rtab->refcnt = 1;
430                 memcpy(rtab->data, nla_data(tab), 1024);
431                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
432                         r->linklayer = __detect_linklayer(r, rtab->data);
433                 rtab->next = qdisc_rtab_list;
434                 qdisc_rtab_list = rtab;
435         } else {
436                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
437         }
438         return rtab;
439 }
440 EXPORT_SYMBOL(qdisc_get_rtab);
441
442 void qdisc_put_rtab(struct qdisc_rate_table *tab)
443 {
444         struct qdisc_rate_table *rtab, **rtabp;
445
446         if (!tab || --tab->refcnt)
447                 return;
448
449         for (rtabp = &qdisc_rtab_list;
450              (rtab = *rtabp) != NULL;
451              rtabp = &rtab->next) {
452                 if (rtab == tab) {
453                         *rtabp = rtab->next;
454                         kfree(rtab);
455                         return;
456                 }
457         }
458 }
459 EXPORT_SYMBOL(qdisc_put_rtab);
460
/* All size tables currently in use, shared via refcount. */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for TCA_STAB: a fixed-size base spec plus an
 * optional binary blob of u16 size slots.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
467
/* Parse a TCA_STAB attribute and return a (possibly shared) size table.
 *
 * Existing tables with identical szopts and slot data are reused with
 * an incremented refcount; otherwise a new table is allocated and
 * linked into qdisc_stab_list. Returns an ERR_PTR on malformed input
 * or allocation failure (details reported via @extack).
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		/* A non-zero slot count requires the slot data blob. */
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* The advertised slot count must match the attribute length. */
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Share an identical table if one already exists. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
525
526 void qdisc_put_stab(struct qdisc_size_table *tab)
527 {
528         if (!tab)
529                 return;
530
531         if (--tab->refcnt == 0) {
532                 list_del(&tab->list);
533                 kfree_rcu(tab, rcu);
534         }
535 }
536 EXPORT_SYMBOL(qdisc_put_stab);
537
/* Emit a TCA_STAB nest carrying only the size spec (the slot data is
 * not dumped). Returns skb->len on success, -1 when the skb is full.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	/* Caller (tc_fill_qdisc) trims the partial message. */
	return -1;
}
554
/* Translate skb->len into the scheduled packet length via the size
 * table: add overhead, index the slot table (extrapolating linearly
 * past its end), scale by size_log, clamp to >= 1, and store the
 * result in qdisc_skb_cb(skb)->pkt_len.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* Without slot data only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Beyond the table: extrapolate from the last entry. */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
583
584 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
585 {
586         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
587                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
588                         txt, qdisc->ops->id, qdisc->handle >> 16);
589                 qdisc->flags |= TCQ_F_WARN_NONWC;
590         }
591 }
592 EXPORT_SYMBOL(qdisc_warn_nonwc);
593
/* hrtimer callback: reschedule the root qdisc so the watchdog owner
 * gets another dequeue opportunity. The rcu_read_lock() covers the
 * qdisc_root() dereference.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
605
/* Initialize @wd so its timer fires qdisc_watchdog() on @clockid,
 * using absolute, CPU-pinned expiry.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
614
/* Common-case watchdog setup: CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
620
/* Arm the watchdog to fire at absolute time @expires (nanoseconds).
 *
 * Skipped when the root qdisc is deactivated (no point waking a dead
 * queue) and when the timer is already armed for the same instant.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
636
/* Cancel a pending watchdog timer, waiting for a running callback. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
642
643 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
644 {
645         struct hlist_head *h;
646         unsigned int i;
647
648         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
649
650         if (h != NULL) {
651                 for (i = 0; i < n; i++)
652                         INIT_HLIST_HEAD(&h[i]);
653         }
654         return h;
655 }
656
/* Double the class hash when the load factor exceeds 3/4.
 *
 * The new bucket array is allocated before taking the qdisc tree
 * lock; the rehash itself runs under sch_tree_lock() so readers never
 * see a half-migrated table. Allocation failure simply keeps the old
 * (still correct, just denser) table.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		/* _safe variant: each entry is moved to the new table. */
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
692
693 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
694 {
695         unsigned int size = 4;
696
697         clhash->hash = qdisc_class_hash_alloc(size);
698         if (!clhash->hash)
699                 return -ENOMEM;
700         clhash->hashsize  = size;
701         clhash->hashmask  = size - 1;
702         clhash->hashelems = 0;
703         return 0;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_init);
706
/* Free the bucket array of a class hash. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
712
713 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
714                              struct Qdisc_class_common *cl)
715 {
716         unsigned int h;
717
718         INIT_HLIST_NODE(&cl->hnode);
719         h = qdisc_class_hash(cl->classid, clhash->hashmask);
720         hlist_add_head(&cl->hnode, &clhash->hash[h]);
721         clhash->hashelems++;
722 }
723 EXPORT_SYMBOL(qdisc_class_hash_insert);
724
/* Unlink @cl from the class hash and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
732
/* Allocate a unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;	/* bound the search to one pass over the space */
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Skip the reserved TC_H_ROOT value on wraparound. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while (--i > 0);

	/* All 0x8000 candidate majors are already in use on @dev. */
	return 0;
}
752
/* Propagate a decrease of @n packets / @len bytes in @sch's queue up
 * through all its ancestors so their qlen/backlog counters stay in
 * sync (e.g. after drops or a subtree flush). A parent is given a
 * qlen_notify() callback when the child just became empty so it can
 * deactivate the class. Runs under rcu_read_lock(); the walk stops at
 * ingress or parentless qdiscs.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	/* Only positive packet counts are charged as drops. */
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seen as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
801
802 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
803                               void *type_data)
804 {
805         struct net_device *dev = qdisc_dev(sch);
806         int err;
807
808         sch->flags &= ~TCQ_F_OFFLOADED;
809         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
810                 return 0;
811
812         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
813         if (err == -EOPNOTSUPP)
814                 return 0;
815
816         if (!err)
817                 sch->flags |= TCQ_F_OFFLOADED;
818
819         return err;
820 }
821 EXPORT_SYMBOL(qdisc_offload_dump_helper);
822
823 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
824                                 struct Qdisc *new, struct Qdisc *old,
825                                 enum tc_setup_type type, void *type_data,
826                                 struct netlink_ext_ack *extack)
827 {
828         bool any_qdisc_is_offloaded;
829         int err;
830
831         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
832                 return;
833
834         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
835
836         /* Don't report error if the graft is part of destroy operation. */
837         if (!err || !new || new == &noop_qdisc)
838                 return;
839
840         /* Don't report error if the parent, the old child and the new
841          * one are not offloaded.
842          */
843         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
844         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
845         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
846
847         if (any_qdisc_is_offloaded)
848                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
849 }
850 EXPORT_SYMBOL(qdisc_offload_graft_helper);
851
/* Inform the device that its root qdisc changed. Errors are reported
 * through @extack by qdisc_offload_graft_helper() only when an
 * offloaded qdisc is involved.
 */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		/* The graft is "ingress" if either side of it is. */
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
866
/* Fill a single qdisc dump/notify message for @q into @skb.
 *
 * Emits the tcmsg header, kind, optional ingress/egress block
 * indices, the qdisc's private options, offload flag, size table and
 * the complete statistics block. Returns skb->len on success, -1 when
 * the skb runs out of space (the partial message is trimmed).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	/* Dump loops call this repeatedly; be nice to the scheduler. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* For qdisc messages tcm_info carries the refcount. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Roll the skb back to where this message started. */
	nlmsg_trim(skb, b);
	return -1;
}
945
946 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
947 {
948         if (q->flags & TCQ_F_BUILTIN)
949                 return true;
950         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
951                 return true;
952
953         return false;
954 }
955
/* Send RTM_DELQDISC/RTM_NEWQDISC notifications for a qdisc change.
 *
 * @oskb: request skb that triggered the change; its portid is used as
 *        the notification source (0 if @oskb is NULL)
 * @n:    request netlink header (seq and NLM_F_ECHO are echoed back)
 * @clid: parent classid being (un)grafted
 * @old:  removed qdisc, reported as RTM_DELQDISC (may be NULL)
 * @new:  installed qdisc, reported as RTM_NEWQDISC (may be NULL)
 *
 * Returns the rtnetlink_send() result, or a negative errno when no
 * message could be built (including when both qdiscs are ignored).
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Built-in/invisible qdiscs are never reported (dump_invisible=false) */
	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

	/* Fall through here when nothing was added to the skb */
err_out:
	kfree_skb(skb);
	return -EINVAL;
}
986
987 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
988                                struct nlmsghdr *n, u32 clid,
989                                struct Qdisc *old, struct Qdisc *new)
990 {
991         if (new || old)
992                 qdisc_notify(net, skb, n, clid, old, new);
993
994         if (old)
995                 qdisc_put(old);
996 }
997
998 static void qdisc_clear_nolock(struct Qdisc *sch)
999 {
1000         sch->flags &= ~TCQ_F_NOLOCK;
1001         if (!(sch->flags & TCQ_F_CPUSTATS))
1002                 return;
1003
1004         free_percpu(sch->cpu_bstats);
1005         free_percpu(sch->cpu_qstats);
1006         sch->cpu_bstats = NULL;
1007         sch->cpu_qstats = NULL;
1008         sch->flags &= ~TCQ_F_CPUSTATS;
1009 }
1010
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev" (when "parent" is NULL).
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy (put) the old qdisc.
 * Returns 0 on success or a negative errno.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Grafting at the device root (egress) or device ingress. */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress is a single pseudo-queue; root egress spans all
		 * TX queues of the device.
		 */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the datapath while the qdisc trees are swapped */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with ->attach() (e.g. multi-queue roots) distribute
		 * themselves over the TX queues; skip the per-queue loop.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional TX queue sharing "new" */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* dev->qdisc holds its own reference on "new" */
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing classful qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		/* cops->graft() returns the displaced qdisc through &old */
		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1107
1108 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1109                                    struct netlink_ext_ack *extack)
1110 {
1111         u32 block_index;
1112
1113         if (tca[TCA_INGRESS_BLOCK]) {
1114                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1115
1116                 if (!block_index) {
1117                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1118                         return -EINVAL;
1119                 }
1120                 if (!sch->ops->ingress_block_set) {
1121                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1122                         return -EOPNOTSUPP;
1123                 }
1124                 sch->ops->ingress_block_set(sch, block_index);
1125         }
1126         if (tca[TCA_EGRESS_BLOCK]) {
1127                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1128
1129                 if (!block_index) {
1130                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1131                         return -EINVAL;
1132                 }
1133                 if (!sch->ops->egress_block_set) {
1134                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1135                         return -EOPNOTSUPP;
1136                 }
1137                 sch->ops->egress_block_set(sch, block_index);
1138         }
1139         return 0;
1140 }
1141
/*
   Allocate and initialize a new qdisc on @dev_queue.

   @p:      parent qdisc (may be NULL for root); only consulted for the
            rate-estimator lock selection below
   @parent: classid stored in sch->parent
   @handle: requested handle; 0 means auto-allocate, TC_H_INGRESS marks
            an ingress qdisc
   @tca:    parsed TCA_* attributes (kind, options, stab, rate estimator)
   @errp:   out-parameter for the error code on NULL return

   Returns the new qdisc, or NULL with *errp set.  Note that -EAGAIN in
   *errp means "module was autoloaded, replay the request".
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	/* Scheduler not registered: try autoloading module "sch_<kind>" */
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		goto err_out;
	}

	/* qdisc_alloc() takes a reference on ops->owner */
	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* No handle given: pick an unused major number */
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Pick which running seqcount protects the stats: the root's
		 * for normal children, the qdisc's own otherwise.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	/* Undo the dev reference taken by qdisc_alloc() and free the qdisc */
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1305
/* Change parameters of an existing qdisc: qdisc-specific options
 * (via ops->change()), the size table (TCA_STAB) and the rate
 * estimator (TCA_RATE).  Block indexes cannot be changed after
 * creation.  Returns 0 or a negative errno; estimator replacement
 * errors are deliberately ignored (see below).
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new stab (NULL if none was given) before dropping
	 * the old one; readers use RCU.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1351
/* State for the check_loop() class walk. */
struct check_loop_arg {
	struct qdisc_walker	w;	/* must be first: check_loop_fn() casts back */
	struct Qdisc		*p;	/* qdisc we must not reach (would form a loop) */
	int			depth;	/* current nesting depth of the walk */
};
1357
1358 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1359                          struct qdisc_walker *w);
1360
1361 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1362 {
1363         struct check_loop_arg   arg;
1364
1365         if (q->ops->cl_ops == NULL)
1366                 return 0;
1367
1368         arg.w.stop = arg.w.skip = arg.w.count = 0;
1369         arg.w.fn = check_loop_fn;
1370         arg.depth = depth;
1371         arg.p = p;
1372         q->ops->cl_ops->walk(q, &arg.w);
1373         return arg.w.stop ? -ELOOP : 0;
1374 }
1375
1376 static int
1377 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1378 {
1379         struct Qdisc *leaf;
1380         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1381         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1382
1383         leaf = cops->leaf(q, cl);
1384         if (leaf) {
1385                 if (leaf == arg->p || arg->depth > 7)
1386                         return -ELOOP;
1387                 return check_loop(leaf, arg->p, arg->depth + 1);
1388         }
1389         return 0;
1390 }
1391
/* Netlink attribute policy shared by the qdisc and class request
 * handlers (RTM_*QDISC / RTM_*TCLASS).
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1403
/*
 * Delete/get qdisc: handler for RTM_DELQDISC and RTM_GETQDISC.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deleting requires CAP_NET_ADMIN; plain get does not */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/* Locate the target qdisc either via its parent classid (clid)
	 * or directly via its handle.
	 */
	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* If a handle was also supplied, it must match */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Graft NULL in place of q, which detaches and destroys it */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1488
/*
 * Create/change qdisc: handler for RTM_NEWQDISC.
 *
 * Depending on the netlink flags (NLM_F_CREATE/REPLACE/EXCL) and on
 * whether a matching qdisc already exists, this either changes the
 * existing qdisc in place or creates a new one and grafts it.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this.
	 * We come back here after module autoload (-EAGAIN from
	 * qdisc_create()) since RTNL was dropped in between.
	 */
	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Parent given: find parent qdisc p and current child q */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				/* Replace: look the requested handle up anywhere
				 * on the device and move that qdisc here.
				 */
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				/* Reference for the graft below */
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		/* No parent: the handle itself must identify the qdisc */
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Let classful parents pick the TX queue for the new child */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN: module was autoloaded, replay the whole request */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_put(q);
		return err;
	}

	return 0;
}
1673
/* Dump @root and (optionally) every qdisc hashed on its device into @skb.
 *
 * @q_idx_p:        in/out running qdisc index, used to resume partial dumps
 * @s_q_idx:        index to resume from (qdiscs before it are skipped)
 * @recur:          also walk the device's qdisc hashtable after @root
 * @dump_invisible: include TCQ_F_INVISIBLE qdiscs
 *
 * Returns 0 when done (or nothing to do), -1 when @skb filled up and
 * the dump must be continued in a later callback invocation.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		/* Already dumped in a previous pass; just advance the index */
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	/* skb full: signal the caller to resume from q_idx next time */
	ret = -1;
	goto out;
}
1727
/* RTM_GETQDISC dump handler: walk every net_device in the namespace and
 * dump its root qdisc tree plus its ingress qdisc, if any.
 *
 * cb->args[0] persists the device index and cb->args[1] the per-device
 * qdisc cursor between dump rounds. Returns skb->len so netlink keeps
 * calling back until a round adds nothing.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Parse request attributes (e.g. TCA_DUMP_INVISIBLE). */
	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			/* Moved past the resume device: restart qdisc cursor. */
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1779
1780
1781
1782 /************************************************
1783  *      Traffic classes manipulation.           *
1784  ************************************************/
1785
/* Build one traffic-class netlink message (tcmsg header, TCA_KIND,
 * class-specific attributes, and statistics) into @skb.
 *
 * tcm_parent and tcm_handle are pre-seeded with the qdisc handle; the
 * class ops' ->dump() callback is expected to overwrite them with the
 * real class values.
 *
 * Returns skb->len on success; on any failure the partially written
 * message is trimmed off and -1 is returned.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Called per class during dumps; yield the CPU on long walks. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Roll the skb tail back to where this message started. */
	nlmsg_trim(skb, b);
	return -1;
}
1831
1832 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1833                          struct nlmsghdr *n, struct Qdisc *q,
1834                          unsigned long cl, int event)
1835 {
1836         struct sk_buff *skb;
1837         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1838         int err = 0;
1839
1840         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1841         if (!skb)
1842                 return -ENOBUFS;
1843
1844         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1845                 kfree_skb(skb);
1846                 return -EINVAL;
1847         }
1848
1849         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1850                              n->nlmsg_flags & NLM_F_ECHO);
1851         if (err > 0)
1852                 err = 0;
1853         return err;
1854 }
1855
1856 static int tclass_del_notify(struct net *net,
1857                              const struct Qdisc_class_ops *cops,
1858                              struct sk_buff *oskb, struct nlmsghdr *n,
1859                              struct Qdisc *q, unsigned long cl)
1860 {
1861         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1862         struct sk_buff *skb;
1863         int err = 0;
1864
1865         if (!cops->delete)
1866                 return -EOPNOTSUPP;
1867
1868         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1869         if (!skb)
1870                 return -ENOBUFS;
1871
1872         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1873                            RTM_DELTCLASS) < 0) {
1874                 kfree_skb(skb);
1875                 return -EINVAL;
1876         }
1877
1878         err = cops->delete(q, cl);
1879         if (err) {
1880                 kfree_skb(skb);
1881                 return err;
1882         }
1883
1884         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1885                              n->nlmsg_flags & NLM_F_ECHO);
1886         if (err > 0)
1887                 err = 0;
1888         return err;
1889 }
1890
1891 #ifdef CONFIG_NET_CLS
1892
/* Walker payload for tcf_node_bind(): rebind filters pointing at class
 * @classid to the (possibly zero) internal class cookie @cl.
 */
struct tcf_bind_args {
	struct tcf_walker w;	/* must stay first: recovered by cast in tcf_node_bind() */
	u32 classid;		/* class handle the filters are bound to */
	unsigned long cl;	/* new class cookie; 0 means unbind */
};
1898
1899 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1900 {
1901         struct tcf_bind_args *a = (void *)arg;
1902
1903         if (tp->ops->bind_class) {
1904                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1905
1906                 sch_tree_lock(q);
1907                 tp->ops->bind_class(n, a->classid, a->cl);
1908                 sch_tree_unlock(q);
1909         }
1910         return 0;
1911 }
1912
1913 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1914                            unsigned long new_cl)
1915 {
1916         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1917         struct tcf_block *block;
1918         struct tcf_chain *chain;
1919         unsigned long cl;
1920
1921         cl = cops->find(q, portid);
1922         if (!cl)
1923                 return;
1924         if (!cops->tcf_block)
1925                 return;
1926         block = cops->tcf_block(q, cl, NULL);
1927         if (!block)
1928                 return;
1929         for (chain = tcf_get_next_chain(block, NULL);
1930              chain;
1931              chain = tcf_get_next_chain(block, chain)) {
1932                 struct tcf_proto *tp;
1933
1934                 for (tp = tcf_get_next_proto(chain, NULL, true);
1935                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
1936                         struct tcf_bind_args arg = {};
1937
1938                         arg.w.fn = tcf_node_bind;
1939                         arg.classid = clid;
1940                         arg.cl = new_cl;
1941                         tp->ops->walk(tp, &arg.w, true);
1942                 }
1943         }
1944 }
1945
1946 #else
1947
/* CONFIG_NET_CLS disabled: no classifiers exist, so rebinding is a no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1952
1953 #endif
1954
/* RTM_{NEW,DEL,GET}TCLASS doit handler: resolve the owning qdisc and the
 * target class from tcm_parent/tcm_handle, then create/change, delete,
 * or report the class. Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only RTM_GETTCLASS is allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class not found: only NEW with NLM_F_CREATE may proceed. */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class's filters by rebinding them to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2091
/* Per-walk state handed to the class walker by tc_dump_tclass_qdisc(). */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must stay first: recovered by cast in qdisc_class_dump() */
	struct sk_buff		*skb;	/* destination dump buffer */
	struct netlink_callback *cb;	/* netlink dump context (seq, portid) */
};
2097
2098 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2099                             struct qdisc_walker *arg)
2100 {
2101         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2102
2103         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2104                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2105                               RTM_NEWTCLASS);
2106 }
2107
/* Dump all classes of a single qdisc @q if it matches the request.
 *
 * The qdisc is skipped (but still counted via *t_p) when it is hidden
 * from dumps, was already covered in a previous round (*t_p < s_t), has
 * no class ops, or does not match the tcm_parent filter. cb->args[1]
 * carries the class-walk resume cursor; the deeper cb->args slots are
 * zeroed once we move past the resume point so per-class state starts
 * fresh.
 *
 * Returns 0 to continue the outer walk, -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2136
/* Dump classes of @root and, when no tcm_parent filter narrows the
 * request, of every qdisc hashed on the device.
 *
 * With tcm_parent set, only the qdisc whose major handle matches it is
 * walked (in addition to @root). Returns 0 on success, -1 when the skb
 * filled up and the dump must resume later.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root))
		return 0;

	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2167
/* RTM_GETTCLASS dump handler: for the device named by tcm_ifindex, dump
 * the classes of its root qdisc tree and of its ingress qdisc.
 *
 * cb->args[0] stores the qdisc cursor; deeper slots belong to the
 * per-qdisc class walkers. Returns skb->len (0 ends the dump).
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a device reference; released via dev_put() below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2200
2201 #ifdef CONFIG_PROC_FS
/* /proc/net/psched: print four hex words describing the scheduler's
 * clock parameters (ns-per-us, ns-per-psched-tick, a fixed 1000000
 * placeholder, and the hrtimer resolution in Hz).
 * NOTE(review): this layout appears to be long-standing userspace ABI
 * (read by tc(8)) — confirm before altering the format.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
2211
2212 static int __net_init psched_net_init(struct net *net)
2213 {
2214         struct proc_dir_entry *e;
2215
2216         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2217         if (e == NULL)
2218                 return -ENOMEM;
2219
2220         return 0;
2221 }
2222
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2227 #else
/* CONFIG_PROC_FS disabled: nothing to create. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2232
/* CONFIG_PROC_FS disabled: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2236 #endif
2237
/* Per-network-namespace setup/teardown for /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2242
/* Subsystem init: register per-netns proc support, the always-built-in
 * qdiscs, and the rtnetlink handlers for qdisc and class messages.
 * Returns 0, or the register_pernet_subsys() error.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* Built-in qdisc implementations. */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	/* Netlink message handlers: NEW/DEL take doit only; GET also dumps. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);