regulator: lochnagar: Use a consisent comment style for SPDX header
[linux-2.6-block.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39
40 /*
41
42    Short review.
43    -------------
44
45    This file consists of two interrelated parts:
46
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54
55    qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets to "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59
60    In turn, classes may have child qdiscs (as rule, queues)
61    attached to them etc. etc. etc.
62
63    The goal of the routines in this file is to translate
64    information supplied by the user in the form of handles
65    into a form more intelligible to the kernel, to perform some
66    sanity checks and the part of the work that is common to all
67    qdiscs, and to provide rtnetlink notifications.
68
69    All real intelligent work is done inside qdisc modules.
70
71
72
73    Every discipline has two major routines: enqueue and dequeue.
74
75    ---dequeue
76
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83
84    ---enqueue
85
86    enqueue returns 0 if the packet was enqueued successfully.
87    If a packet (this one or another one) was dropped, it returns
88    a non-zero error code.
89    NET_XMIT_DROP        - this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93
94    Auxiliary routines:
95
96    ---peek
97
98    like dequeue but without removing a packet from the queue
99
100    ---reset
101
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104
105    ---init
106
107    initializes newly created qdisc.
108
109    ---destroy
110
111    destroys resources allocated by init and during lifetime of qdisc.
112
113    ---change
114
115    changes qdisc parameters.
116  */
117
118 /* Protects list of registered TC modules. It is pure SMP lock. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120
121
122 /************************************************
123  *      Queueing disciplines manipulation.      *
124  ************************************************/
125
126
127 /* The list of all installed queueing disciplines. */
128
129 static struct Qdisc_ops *qdisc_base;
130
131 /* Register/unregister queueing discipline */
132
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135         struct Qdisc_ops *q, **qp;
136         int rc = -EEXIST;
137
138         write_lock(&qdisc_mod_lock);
139         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140                 if (!strcmp(qops->id, q->id))
141                         goto out;
142
143         if (qops->enqueue == NULL)
144                 qops->enqueue = noop_qdisc_ops.enqueue;
145         if (qops->peek == NULL) {
146                 if (qops->dequeue == NULL)
147                         qops->peek = noop_qdisc_ops.peek;
148                 else
149                         goto out_einval;
150         }
151         if (qops->dequeue == NULL)
152                 qops->dequeue = noop_qdisc_ops.dequeue;
153
154         if (qops->cl_ops) {
155                 const struct Qdisc_class_ops *cops = qops->cl_ops;
156
157                 if (!(cops->find && cops->walk && cops->leaf))
158                         goto out_einval;
159
160                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161                         goto out_einval;
162         }
163
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170
171 out_einval:
172         rc = -EINVAL;
173         goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179         struct Qdisc_ops *q, **qp;
180         int err = -ENOENT;
181
182         write_lock(&qdisc_mod_lock);
183         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184                 if (q == qops)
185                         break;
186         if (q) {
187                 *qp = q->next;
188                 q->next = NULL;
189                 err = 0;
190         }
191         write_unlock(&qdisc_mod_lock);
192         return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195
196 /* Get default qdisc if not otherwise specified */
197 void qdisc_get_default(char *name, size_t len)
198 {
199         read_lock(&qdisc_mod_lock);
200         strlcpy(name, default_qdisc_ops->id, len);
201         read_unlock(&qdisc_mod_lock);
202 }
203
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206         struct Qdisc_ops *q = NULL;
207
208         for (q = qdisc_base; q; q = q->next) {
209                 if (!strcmp(name, q->id)) {
210                         if (!try_module_get(q->owner))
211                                 q = NULL;
212                         break;
213                 }
214         }
215
216         return q;
217 }
218
219 /* Set new default qdisc to use */
220 int qdisc_set_default(const char *name)
221 {
222         const struct Qdisc_ops *ops;
223
224         if (!capable(CAP_NET_ADMIN))
225                 return -EPERM;
226
227         write_lock(&qdisc_mod_lock);
228         ops = qdisc_lookup_default(name);
229         if (!ops) {
230                 /* Not found, drop lock and try to load module */
231                 write_unlock(&qdisc_mod_lock);
232                 request_module("sch_%s", name);
233                 write_lock(&qdisc_mod_lock);
234
235                 ops = qdisc_lookup_default(name);
236         }
237
238         if (ops) {
239                 /* Set new default */
240                 module_put(default_qdisc_ops->owner);
241                 default_qdisc_ops = ops;
242         }
243         write_unlock(&qdisc_mod_lock);
244
245         return ops ? 0 : -ENOENT;
246 }
247
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: apply the default qdisc named by CONFIG_DEFAULT_NET_SCH. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264         struct Qdisc *q;
265
266         if (!qdisc_dev(root))
267                 return (root->handle == handle ? root : NULL);
268
269         if (!(root->flags & TCQ_F_BUILTIN) &&
270             root->handle == handle)
271                 return root;
272
273         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(dev->qdisc, handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320         unsigned long cl;
321         struct Qdisc *leaf;
322         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323
324         if (cops == NULL)
325                 return NULL;
326         cl = cops->find(p, classid);
327
328         if (cl == 0)
329                 return NULL;
330         leaf = cops->leaf(p, cl);
331         return leaf;
332 }
333
334 /* Find queueing discipline by name */
335
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338         struct Qdisc_ops *q = NULL;
339
340         if (kind) {
341                 read_lock(&qdisc_mod_lock);
342                 for (q = qdisc_base; q; q = q->next) {
343                         if (nla_strcmp(kind, q->id) == 0) {
344                                 if (!try_module_get(q->owner))
345                                         q = NULL;
346                                 break;
347                         }
348                 }
349                 read_unlock(&qdisc_mod_lock);
350         }
351         return q;
352 }
353
354 /* The linklayer setting were not transferred from iproute2, in older
355  * versions, and the rate tables lookup systems have been dropped in
356  * the kernel. To keep backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting if the rate
358  * table were modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, when
365  * the rate tables have been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373         int low       = roundup(r->mpu, 48);
374         int high      = roundup(low+1, 48);
375         int cell_low  = low >> r->cell_log;
376         int cell_high = (high >> r->cell_log) - 1;
377
378         /* rtab is too inaccurate at rates > 100Mbit/s */
379         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380                 pr_debug("TC linklayer: Giving up ATM detection\n");
381                 return TC_LINKLAYER_ETHERNET;
382         }
383
384         if ((cell_high > cell_low) && (cell_high < 256)
385             && (rtab[cell_low] == rtab[cell_high])) {
386                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387                          cell_low, cell_high, rtab[cell_high]);
388                 return TC_LINKLAYER_ATM;
389         }
390         return TC_LINKLAYER_ETHERNET;
391 }
392
393 static struct qdisc_rate_table *qdisc_rtab_list;
394
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396                                         struct nlattr *tab,
397                                         struct netlink_ext_ack *extack)
398 {
399         struct qdisc_rate_table *rtab;
400
401         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
402             nla_len(tab) != TC_RTAB_SIZE) {
403                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
404                 return NULL;
405         }
406
407         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
408                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
409                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
410                         rtab->refcnt++;
411                         return rtab;
412                 }
413         }
414
415         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
416         if (rtab) {
417                 rtab->rate = *r;
418                 rtab->refcnt = 1;
419                 memcpy(rtab->data, nla_data(tab), 1024);
420                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
421                         r->linklayer = __detect_linklayer(r, rtab->data);
422                 rtab->next = qdisc_rtab_list;
423                 qdisc_rtab_list = rtab;
424         } else {
425                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
426         }
427         return rtab;
428 }
429 EXPORT_SYMBOL(qdisc_get_rtab);
430
431 void qdisc_put_rtab(struct qdisc_rate_table *tab)
432 {
433         struct qdisc_rate_table *rtab, **rtabp;
434
435         if (!tab || --tab->refcnt)
436                 return;
437
438         for (rtabp = &qdisc_rtab_list;
439              (rtab = *rtabp) != NULL;
440              rtabp = &rtab->next) {
441                 if (rtab == tab) {
442                         *rtabp = rtab->next;
443                         kfree(rtab);
444                         return;
445                 }
446         }
447 }
448 EXPORT_SYMBOL(qdisc_put_rtab);
449
450 static LIST_HEAD(qdisc_stab_list);
451
452 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
453         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
454         [TCA_STAB_DATA] = { .type = NLA_BINARY },
455 };
456
457 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
458                                                struct netlink_ext_ack *extack)
459 {
460         struct nlattr *tb[TCA_STAB_MAX + 1];
461         struct qdisc_size_table *stab;
462         struct tc_sizespec *s;
463         unsigned int tsize = 0;
464         u16 *tab = NULL;
465         int err;
466
467         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
468         if (err < 0)
469                 return ERR_PTR(err);
470         if (!tb[TCA_STAB_BASE]) {
471                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
472                 return ERR_PTR(-EINVAL);
473         }
474
475         s = nla_data(tb[TCA_STAB_BASE]);
476
477         if (s->tsize > 0) {
478                 if (!tb[TCA_STAB_DATA]) {
479                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
480                         return ERR_PTR(-EINVAL);
481                 }
482                 tab = nla_data(tb[TCA_STAB_DATA]);
483                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
484         }
485
486         if (tsize != s->tsize || (!tab && tsize > 0)) {
487                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         list_for_each_entry(stab, &qdisc_stab_list, list) {
492                 if (memcmp(&stab->szopts, s, sizeof(*s)))
493                         continue;
494                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
495                         continue;
496                 stab->refcnt++;
497                 return stab;
498         }
499
500         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
501         if (!stab)
502                 return ERR_PTR(-ENOMEM);
503
504         stab->refcnt = 1;
505         stab->szopts = *s;
506         if (tsize > 0)
507                 memcpy(stab->data, tab, tsize * sizeof(u16));
508
509         list_add_tail(&stab->list, &qdisc_stab_list);
510
511         return stab;
512 }
513
514 static void stab_kfree_rcu(struct rcu_head *head)
515 {
516         kfree(container_of(head, struct qdisc_size_table, rcu));
517 }
518
519 void qdisc_put_stab(struct qdisc_size_table *tab)
520 {
521         if (!tab)
522                 return;
523
524         if (--tab->refcnt == 0) {
525                 list_del(&tab->list);
526                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
527         }
528 }
529 EXPORT_SYMBOL(qdisc_put_stab);
530
531 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
532 {
533         struct nlattr *nest;
534
535         nest = nla_nest_start(skb, TCA_STAB);
536         if (nest == NULL)
537                 goto nla_put_failure;
538         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
539                 goto nla_put_failure;
540         nla_nest_end(skb, nest);
541
542         return skb->len;
543
544 nla_put_failure:
545         return -1;
546 }
547
548 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
549                                const struct qdisc_size_table *stab)
550 {
551         int pkt_len, slot;
552
553         pkt_len = skb->len + stab->szopts.overhead;
554         if (unlikely(!stab->szopts.tsize))
555                 goto out;
556
557         slot = pkt_len + stab->szopts.cell_align;
558         if (unlikely(slot < 0))
559                 slot = 0;
560
561         slot >>= stab->szopts.cell_log;
562         if (likely(slot < stab->szopts.tsize))
563                 pkt_len = stab->data[slot];
564         else
565                 pkt_len = stab->data[stab->szopts.tsize - 1] *
566                                 (slot / stab->szopts.tsize) +
567                                 stab->data[slot % stab->szopts.tsize];
568
569         pkt_len <<= stab->szopts.size_log;
570 out:
571         if (unlikely(pkt_len < 1))
572                 pkt_len = 1;
573         qdisc_skb_cb(skb)->pkt_len = pkt_len;
574 }
575 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
576
577 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
578 {
579         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
580                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
581                         txt, qdisc->ops->id, qdisc->handle >> 16);
582                 qdisc->flags |= TCQ_F_WARN_NONWC;
583         }
584 }
585 EXPORT_SYMBOL(qdisc_warn_nonwc);
586
587 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
588 {
589         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
590                                                  timer);
591
592         rcu_read_lock();
593         __netif_schedule(qdisc_root(wd->qdisc));
594         rcu_read_unlock();
595
596         return HRTIMER_NORESTART;
597 }
598
599 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
600                                  clockid_t clockid)
601 {
602         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
603         wd->timer.function = qdisc_watchdog;
604         wd->qdisc = qdisc;
605 }
606 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
607
608 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
609 {
610         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
611 }
612 EXPORT_SYMBOL(qdisc_watchdog_init);
613
614 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
615 {
616         if (test_bit(__QDISC_STATE_DEACTIVATED,
617                      &qdisc_root_sleeping(wd->qdisc)->state))
618                 return;
619
620         if (wd->last_expires == expires)
621                 return;
622
623         wd->last_expires = expires;
624         hrtimer_start(&wd->timer,
625                       ns_to_ktime(expires),
626                       HRTIMER_MODE_ABS_PINNED);
627 }
628 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
629
630 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
631 {
632         hrtimer_cancel(&wd->timer);
633 }
634 EXPORT_SYMBOL(qdisc_watchdog_cancel);
635
636 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
637 {
638         struct hlist_head *h;
639         unsigned int i;
640
641         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
642
643         if (h != NULL) {
644                 for (i = 0; i < n; i++)
645                         INIT_HLIST_HEAD(&h[i]);
646         }
647         return h;
648 }
649
650 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
651 {
652         struct Qdisc_class_common *cl;
653         struct hlist_node *next;
654         struct hlist_head *nhash, *ohash;
655         unsigned int nsize, nmask, osize;
656         unsigned int i, h;
657
658         /* Rehash when load factor exceeds 0.75 */
659         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
660                 return;
661         nsize = clhash->hashsize * 2;
662         nmask = nsize - 1;
663         nhash = qdisc_class_hash_alloc(nsize);
664         if (nhash == NULL)
665                 return;
666
667         ohash = clhash->hash;
668         osize = clhash->hashsize;
669
670         sch_tree_lock(sch);
671         for (i = 0; i < osize; i++) {
672                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
673                         h = qdisc_class_hash(cl->classid, nmask);
674                         hlist_add_head(&cl->hnode, &nhash[h]);
675                 }
676         }
677         clhash->hash     = nhash;
678         clhash->hashsize = nsize;
679         clhash->hashmask = nmask;
680         sch_tree_unlock(sch);
681
682         kvfree(ohash);
683 }
684 EXPORT_SYMBOL(qdisc_class_hash_grow);
685
686 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
687 {
688         unsigned int size = 4;
689
690         clhash->hash = qdisc_class_hash_alloc(size);
691         if (!clhash->hash)
692                 return -ENOMEM;
693         clhash->hashsize  = size;
694         clhash->hashmask  = size - 1;
695         clhash->hashelems = 0;
696         return 0;
697 }
698 EXPORT_SYMBOL(qdisc_class_hash_init);
699
700 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
701 {
702         kvfree(clhash->hash);
703 }
704 EXPORT_SYMBOL(qdisc_class_hash_destroy);
705
706 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
707                              struct Qdisc_class_common *cl)
708 {
709         unsigned int h;
710
711         INIT_HLIST_NODE(&cl->hnode);
712         h = qdisc_class_hash(cl->classid, clhash->hashmask);
713         hlist_add_head(&cl->hnode, &clhash->hash[h]);
714         clhash->hashelems++;
715 }
716 EXPORT_SYMBOL(qdisc_class_hash_insert);
717
718 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
719                              struct Qdisc_class_common *cl)
720 {
721         hlist_del(&cl->hnode);
722         clhash->hashelems--;
723 }
724 EXPORT_SYMBOL(qdisc_class_hash_remove);
725
726 /* Allocate an unique handle from space managed by kernel
727  * Possible range is [8000-FFFF]:0000 (0x8000 values)
728  */
729 static u32 qdisc_alloc_handle(struct net_device *dev)
730 {
731         int i = 0x8000;
732         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
733
734         do {
735                 autohandle += TC_H_MAKE(0x10000U, 0);
736                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
737                         autohandle = TC_H_MAKE(0x80000000U, 0);
738                 if (!qdisc_lookup(dev, autohandle))
739                         return autohandle;
740                 cond_resched();
741         } while (--i > 0);
742
743         return 0;
744 }
745
746 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
747                                unsigned int len)
748 {
749         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
750         const struct Qdisc_class_ops *cops;
751         unsigned long cl;
752         u32 parentid;
753         bool notify;
754         int drops;
755
756         if (n == 0 && len == 0)
757                 return;
758         drops = max_t(int, n, 0);
759         rcu_read_lock();
760         while ((parentid = sch->parent)) {
761                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
762                         break;
763
764                 if (sch->flags & TCQ_F_NOPARENT)
765                         break;
766                 /* Notify parent qdisc only if child qdisc becomes empty.
767                  *
768                  * If child was empty even before update then backlog
769                  * counter is screwed and we skip notification because
770                  * parent class is already passive.
771                  *
772                  * If the original child was offloaded then it is allowed
773                  * to be seem as empty, so the parent is notified anyway.
774                  */
775                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
776                                                        !qdisc_is_offloaded);
777                 /* TODO: perform the search on a per txq basis */
778                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
779                 if (sch == NULL) {
780                         WARN_ON_ONCE(parentid != TC_H_ROOT);
781                         break;
782                 }
783                 cops = sch->ops->cl_ops;
784                 if (notify && cops->qlen_notify) {
785                         cl = cops->find(sch, parentid);
786                         cops->qlen_notify(sch, cl);
787                 }
788                 sch->q.qlen -= n;
789                 sch->qstats.backlog -= len;
790                 __qdisc_qstats_drop(sch, drops);
791         }
792         rcu_read_unlock();
793 }
794 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
795
796 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
797                          u32 portid, u32 seq, u16 flags, int event)
798 {
799         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
800         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
801         struct tcmsg *tcm;
802         struct nlmsghdr  *nlh;
803         unsigned char *b = skb_tail_pointer(skb);
804         struct gnet_dump d;
805         struct qdisc_size_table *stab;
806         u32 block_index;
807         __u32 qlen;
808
809         cond_resched();
810         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
811         if (!nlh)
812                 goto out_nlmsg_trim;
813         tcm = nlmsg_data(nlh);
814         tcm->tcm_family = AF_UNSPEC;
815         tcm->tcm__pad1 = 0;
816         tcm->tcm__pad2 = 0;
817         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
818         tcm->tcm_parent = clid;
819         tcm->tcm_handle = q->handle;
820         tcm->tcm_info = refcount_read(&q->refcnt);
821         if (nla_put_string(skb, TCA_KIND, q->ops->id))
822                 goto nla_put_failure;
823         if (q->ops->ingress_block_get) {
824                 block_index = q->ops->ingress_block_get(q);
825                 if (block_index &&
826                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
827                         goto nla_put_failure;
828         }
829         if (q->ops->egress_block_get) {
830                 block_index = q->ops->egress_block_get(q);
831                 if (block_index &&
832                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
833                         goto nla_put_failure;
834         }
835         if (q->ops->dump && q->ops->dump(q, skb) < 0)
836                 goto nla_put_failure;
837         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
838                 goto nla_put_failure;
839         qlen = qdisc_qlen_sum(q);
840
841         stab = rtnl_dereference(q->stab);
842         if (stab && qdisc_dump_stab(skb, stab) < 0)
843                 goto nla_put_failure;
844
845         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
846                                          NULL, &d, TCA_PAD) < 0)
847                 goto nla_put_failure;
848
849         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
850                 goto nla_put_failure;
851
852         if (qdisc_is_percpu_stats(q)) {
853                 cpu_bstats = q->cpu_bstats;
854                 cpu_qstats = q->cpu_qstats;
855         }
856
857         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
858                                   &d, cpu_bstats, &q->bstats) < 0 ||
859             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
860             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
861                 goto nla_put_failure;
862
863         if (gnet_stats_finish_copy(&d) < 0)
864                 goto nla_put_failure;
865
866         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
867         return skb->len;
868
869 out_nlmsg_trim:
870 nla_put_failure:
871         nlmsg_trim(skb, b);
872         return -1;
873 }
874
875 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
876 {
877         if (q->flags & TCQ_F_BUILTIN)
878                 return true;
879         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
880                 return true;
881
882         return false;
883 }
884
885 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
886                         struct nlmsghdr *n, u32 clid,
887                         struct Qdisc *old, struct Qdisc *new)
888 {
889         struct sk_buff *skb;
890         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
891
892         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
893         if (!skb)
894                 return -ENOBUFS;
895
896         if (old && !tc_qdisc_dump_ignore(old, false)) {
897                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
898                                   0, RTM_DELQDISC) < 0)
899                         goto err_out;
900         }
901         if (new && !tc_qdisc_dump_ignore(new, false)) {
902                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
903                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
904                         goto err_out;
905         }
906
907         if (skb->len)
908                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
909                                       n->nlmsg_flags & NLM_F_ECHO);
910
911 err_out:
912         kfree_skb(skb);
913         return -EINVAL;
914 }
915
916 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
917                                struct nlmsghdr *n, u32 clid,
918                                struct Qdisc *old, struct Qdisc *new)
919 {
920         if (new || old)
921                 qdisc_notify(net, skb, n, clid, old, new);
922
923         if (old)
924                 qdisc_destroy(old);
925 }
926
927 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
928  * to device "dev".
929  *
930  * When appropriate send a netlink notification using 'skb'
931  * and "n".
932  *
933  * On success, destroy old qdisc.
934  */
935
936 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
937                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
938                        struct Qdisc *new, struct Qdisc *old,
939                        struct netlink_ext_ack *extack)
940 {
941         struct Qdisc *q = old;
942         struct net *net = dev_net(dev);
943         int err = 0;
944
945         if (parent == NULL) {
946                 unsigned int i, num_q, ingress;
947
948                 ingress = 0;
949                 num_q = dev->num_tx_queues;
950                 if ((q && q->flags & TCQ_F_INGRESS) ||
951                     (new && new->flags & TCQ_F_INGRESS)) {
952                         num_q = 1;
953                         ingress = 1;
954                         if (!dev_ingress_queue(dev)) {
955                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
956                                 return -ENOENT;
957                         }
958                 }
959
960                 if (dev->flags & IFF_UP)
961                         dev_deactivate(dev);
962
963                 if (new && new->ops->attach)
964                         goto skip;
965
966                 for (i = 0; i < num_q; i++) {
967                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
968
969                         if (!ingress)
970                                 dev_queue = netdev_get_tx_queue(dev, i);
971
972                         old = dev_graft_qdisc(dev_queue, new);
973                         if (new && i > 0)
974                                 qdisc_refcount_inc(new);
975
976                         if (!ingress)
977                                 qdisc_destroy(old);
978                 }
979
980 skip:
981                 if (!ingress) {
982                         notify_and_destroy(net, skb, n, classid,
983                                            dev->qdisc, new);
984                         if (new && !new->ops->attach)
985                                 qdisc_refcount_inc(new);
986                         dev->qdisc = new ? : &noop_qdisc;
987
988                         if (new && new->ops->attach)
989                                 new->ops->attach(new);
990                 } else {
991                         notify_and_destroy(net, skb, n, classid, old, new);
992                 }
993
994                 if (dev->flags & IFF_UP)
995                         dev_activate(dev);
996         } else {
997                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
998
999                 /* Only support running class lockless if parent is lockless */
1000                 if (new && (new->flags & TCQ_F_NOLOCK) &&
1001                     parent && !(parent->flags & TCQ_F_NOLOCK))
1002                         new->flags &= ~TCQ_F_NOLOCK;
1003
1004                 err = -EOPNOTSUPP;
1005                 if (cops && cops->graft) {
1006                         unsigned long cl = cops->find(parent, classid);
1007
1008                         if (cl) {
1009                                 err = cops->graft(parent, cl, new, &old,
1010                                                   extack);
1011                         } else {
1012                                 NL_SET_ERR_MSG(extack, "Specified class not found");
1013                                 err = -ENOENT;
1014                         }
1015                 }
1016                 if (!err)
1017                         notify_and_destroy(net, skb, n, classid, old, new);
1018         }
1019         return err;
1020 }
1021
1022 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1023                                    struct netlink_ext_ack *extack)
1024 {
1025         u32 block_index;
1026
1027         if (tca[TCA_INGRESS_BLOCK]) {
1028                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1029
1030                 if (!block_index) {
1031                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1032                         return -EINVAL;
1033                 }
1034                 if (!sch->ops->ingress_block_set) {
1035                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1036                         return -EOPNOTSUPP;
1037                 }
1038                 sch->ops->ingress_block_set(sch, block_index);
1039         }
1040         if (tca[TCA_EGRESS_BLOCK]) {
1041                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1042
1043                 if (!block_index) {
1044                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1045                         return -EINVAL;
1046                 }
1047                 if (!sch->ops->egress_block_set) {
1048                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1049                         return -EOPNOTSUPP;
1050                 }
1051                 sch->ops->egress_block_set(sch, block_index);
1052         }
1053         return 0;
1054 }
1055
1056 /* lockdep annotation is needed for ingress; egress gets it only for name */
1057 static struct lock_class_key qdisc_tx_lock;
1058 static struct lock_class_key qdisc_rx_lock;
1059
1060 /*
1061    Allocate and initialize new qdisc.
1062
1063    Parameters are passed via opt.
1064  */
1065
1066 static struct Qdisc *qdisc_create(struct net_device *dev,
1067                                   struct netdev_queue *dev_queue,
1068                                   struct Qdisc *p, u32 parent, u32 handle,
1069                                   struct nlattr **tca, int *errp,
1070                                   struct netlink_ext_ack *extack)
1071 {
1072         int err;
1073         struct nlattr *kind = tca[TCA_KIND];
1074         struct Qdisc *sch;
1075         struct Qdisc_ops *ops;
1076         struct qdisc_size_table *stab;
1077
1078         ops = qdisc_lookup_ops(kind);
1079 #ifdef CONFIG_MODULES
1080         if (ops == NULL && kind != NULL) {
1081                 char name[IFNAMSIZ];
1082                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1083                         /* We dropped the RTNL semaphore in order to
1084                          * perform the module load.  So, even if we
1085                          * succeeded in loading the module we have to
1086                          * tell the caller to replay the request.  We
1087                          * indicate this using -EAGAIN.
1088                          * We replay the request because the device may
1089                          * go away in the mean time.
1090                          */
1091                         rtnl_unlock();
1092                         request_module("sch_%s", name);
1093                         rtnl_lock();
1094                         ops = qdisc_lookup_ops(kind);
1095                         if (ops != NULL) {
1096                                 /* We will try again qdisc_lookup_ops,
1097                                  * so don't keep a reference.
1098                                  */
1099                                 module_put(ops->owner);
1100                                 err = -EAGAIN;
1101                                 goto err_out;
1102                         }
1103                 }
1104         }
1105 #endif
1106
1107         err = -ENOENT;
1108         if (!ops) {
1109                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1110                 goto err_out;
1111         }
1112
1113         sch = qdisc_alloc(dev_queue, ops, extack);
1114         if (IS_ERR(sch)) {
1115                 err = PTR_ERR(sch);
1116                 goto err_out2;
1117         }
1118
1119         sch->parent = parent;
1120
1121         if (handle == TC_H_INGRESS) {
1122                 sch->flags |= TCQ_F_INGRESS;
1123                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1124                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1125         } else {
1126                 if (handle == 0) {
1127                         handle = qdisc_alloc_handle(dev);
1128                         err = -ENOMEM;
1129                         if (handle == 0)
1130                                 goto err_out3;
1131                 }
1132                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1133                 if (!netif_is_multiqueue(dev))
1134                         sch->flags |= TCQ_F_ONETXQUEUE;
1135         }
1136
1137         sch->handle = handle;
1138
1139         /* This exist to keep backward compatible with a userspace
1140          * loophole, what allowed userspace to get IFF_NO_QUEUE
1141          * facility on older kernels by setting tx_queue_len=0 (prior
1142          * to qdisc init), and then forgot to reinit tx_queue_len
1143          * before again attaching a qdisc.
1144          */
1145         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1146                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1147                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1148         }
1149
1150         err = qdisc_block_indexes_set(sch, tca, extack);
1151         if (err)
1152                 goto err_out3;
1153
1154         if (ops->init) {
1155                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1156                 if (err != 0)
1157                         goto err_out5;
1158         }
1159
1160         if (tca[TCA_STAB]) {
1161                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1162                 if (IS_ERR(stab)) {
1163                         err = PTR_ERR(stab);
1164                         goto err_out4;
1165                 }
1166                 rcu_assign_pointer(sch->stab, stab);
1167         }
1168         if (tca[TCA_RATE]) {
1169                 seqcount_t *running;
1170
1171                 err = -EOPNOTSUPP;
1172                 if (sch->flags & TCQ_F_MQROOT) {
1173                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1174                         goto err_out4;
1175                 }
1176
1177                 if (sch->parent != TC_H_ROOT &&
1178                     !(sch->flags & TCQ_F_INGRESS) &&
1179                     (!p || !(p->flags & TCQ_F_MQROOT)))
1180                         running = qdisc_root_sleeping_running(sch);
1181                 else
1182                         running = &sch->running;
1183
1184                 err = gen_new_estimator(&sch->bstats,
1185                                         sch->cpu_bstats,
1186                                         &sch->rate_est,
1187                                         NULL,
1188                                         running,
1189                                         tca[TCA_RATE]);
1190                 if (err) {
1191                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1192                         goto err_out4;
1193                 }
1194         }
1195
1196         qdisc_hash_add(sch, false);
1197
1198         return sch;
1199
1200 err_out5:
1201         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1202         if (ops->destroy)
1203                 ops->destroy(sch);
1204 err_out3:
1205         dev_put(dev);
1206         qdisc_free(sch);
1207 err_out2:
1208         module_put(ops->owner);
1209 err_out:
1210         *errp = err;
1211         return NULL;
1212
1213 err_out4:
1214         /*
1215          * Any broken qdiscs that would require a ops->reset() here?
1216          * The qdisc was never in action so it shouldn't be necessary.
1217          */
1218         qdisc_put_stab(rtnl_dereference(sch->stab));
1219         if (ops->destroy)
1220                 ops->destroy(sch);
1221         goto err_out3;
1222 }
1223
1224 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1225                         struct netlink_ext_ack *extack)
1226 {
1227         struct qdisc_size_table *ostab, *stab = NULL;
1228         int err = 0;
1229
1230         if (tca[TCA_OPTIONS]) {
1231                 if (!sch->ops->change) {
1232                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1233                         return -EINVAL;
1234                 }
1235                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1236                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1237                         return -EOPNOTSUPP;
1238                 }
1239                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1240                 if (err)
1241                         return err;
1242         }
1243
1244         if (tca[TCA_STAB]) {
1245                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1246                 if (IS_ERR(stab))
1247                         return PTR_ERR(stab);
1248         }
1249
1250         ostab = rtnl_dereference(sch->stab);
1251         rcu_assign_pointer(sch->stab, stab);
1252         qdisc_put_stab(ostab);
1253
1254         if (tca[TCA_RATE]) {
1255                 /* NB: ignores errors from replace_estimator
1256                    because change can't be undone. */
1257                 if (sch->flags & TCQ_F_MQROOT)
1258                         goto out;
1259                 gen_replace_estimator(&sch->bstats,
1260                                       sch->cpu_bstats,
1261                                       &sch->rate_est,
1262                                       NULL,
1263                                       qdisc_root_sleeping_running(sch),
1264                                       tca[TCA_RATE]);
1265         }
1266 out:
1267         return 0;
1268 }
1269
1270 struct check_loop_arg {
1271         struct qdisc_walker     w;
1272         struct Qdisc            *p;
1273         int                     depth;
1274 };
1275
1276 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1277                          struct qdisc_walker *w);
1278
1279 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1280 {
1281         struct check_loop_arg   arg;
1282
1283         if (q->ops->cl_ops == NULL)
1284                 return 0;
1285
1286         arg.w.stop = arg.w.skip = arg.w.count = 0;
1287         arg.w.fn = check_loop_fn;
1288         arg.depth = depth;
1289         arg.p = p;
1290         q->ops->cl_ops->walk(q, &arg.w);
1291         return arg.w.stop ? -ELOOP : 0;
1292 }
1293
1294 static int
1295 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1296 {
1297         struct Qdisc *leaf;
1298         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1299         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1300
1301         leaf = cops->leaf(q, cl);
1302         if (leaf) {
1303                 if (leaf == arg->p || arg->depth > 7)
1304                         return -ELOOP;
1305                 return check_loop(leaf, arg->p, arg->depth + 1);
1306         }
1307         return 0;
1308 }
1309
1310 /*
1311  * Delete/get qdisc.
1312  */
1313
1314 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1315                         struct netlink_ext_ack *extack)
1316 {
1317         struct net *net = sock_net(skb->sk);
1318         struct tcmsg *tcm = nlmsg_data(n);
1319         struct nlattr *tca[TCA_MAX + 1];
1320         struct net_device *dev;
1321         u32 clid;
1322         struct Qdisc *q = NULL;
1323         struct Qdisc *p = NULL;
1324         int err;
1325
1326         if ((n->nlmsg_type != RTM_GETQDISC) &&
1327             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1328                 return -EPERM;
1329
1330         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1331         if (err < 0)
1332                 return err;
1333
1334         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1335         if (!dev)
1336                 return -ENODEV;
1337
1338         clid = tcm->tcm_parent;
1339         if (clid) {
1340                 if (clid != TC_H_ROOT) {
1341                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1342                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1343                                 if (!p) {
1344                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1345                                         return -ENOENT;
1346                                 }
1347                                 q = qdisc_leaf(p, clid);
1348                         } else if (dev_ingress_queue(dev)) {
1349                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1350                         }
1351                 } else {
1352                         q = dev->qdisc;
1353                 }
1354                 if (!q) {
1355                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1356                         return -ENOENT;
1357                 }
1358
1359                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1360                         NL_SET_ERR_MSG(extack, "Invalid handle");
1361                         return -EINVAL;
1362                 }
1363         } else {
1364                 q = qdisc_lookup(dev, tcm->tcm_handle);
1365                 if (!q) {
1366                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1367                         return -ENOENT;
1368                 }
1369         }
1370
1371         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1372                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1373                 return -EINVAL;
1374         }
1375
1376         if (n->nlmsg_type == RTM_DELQDISC) {
1377                 if (!clid) {
1378                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1379                         return -EINVAL;
1380                 }
1381                 if (q->handle == 0) {
1382                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1383                         return -ENOENT;
1384                 }
1385                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1386                 if (err != 0)
1387                         return err;
1388         } else {
1389                 qdisc_notify(net, skb, n, clid, NULL, q);
1390         }
1391         return 0;
1392 }
1393
1394 /*
1395  * Create/change qdisc.
1396  */
1397
1398 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1399                            struct netlink_ext_ack *extack)
1400 {
1401         struct net *net = sock_net(skb->sk);
1402         struct tcmsg *tcm;
1403         struct nlattr *tca[TCA_MAX + 1];
1404         struct net_device *dev;
1405         u32 clid;
1406         struct Qdisc *q, *p;
1407         int err;
1408
1409         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1410                 return -EPERM;
1411
1412 replay:
1413         /* Reinit, just in case something touches this. */
1414         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1415         if (err < 0)
1416                 return err;
1417
1418         tcm = nlmsg_data(n);
1419         clid = tcm->tcm_parent;
1420         q = p = NULL;
1421
1422         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1423         if (!dev)
1424                 return -ENODEV;
1425
1426
1427         if (clid) {
1428                 if (clid != TC_H_ROOT) {
1429                         if (clid != TC_H_INGRESS) {
1430                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1431                                 if (!p) {
1432                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1433                                         return -ENOENT;
1434                                 }
1435                                 q = qdisc_leaf(p, clid);
1436                         } else if (dev_ingress_queue_create(dev)) {
1437                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1438                         }
1439                 } else {
1440                         q = dev->qdisc;
1441                 }
1442
1443                 /* It may be default qdisc, ignore it */
1444                 if (q && q->handle == 0)
1445                         q = NULL;
1446
1447                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1448                         if (tcm->tcm_handle) {
1449                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1450                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1451                                         return -EEXIST;
1452                                 }
1453                                 if (TC_H_MIN(tcm->tcm_handle)) {
1454                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1455                                         return -EINVAL;
1456                                 }
1457                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1458                                 if (!q)
1459                                         goto create_n_graft;
1460                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1461                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1462                                         return -EEXIST;
1463                                 }
1464                                 if (tca[TCA_KIND] &&
1465                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1466                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1467                                         return -EINVAL;
1468                                 }
1469                                 if (q == p ||
1470                                     (p && check_loop(q, p, 0))) {
1471                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1472                                         return -ELOOP;
1473                                 }
1474                                 qdisc_refcount_inc(q);
1475                                 goto graft;
1476                         } else {
1477                                 if (!q)
1478                                         goto create_n_graft;
1479
1480                                 /* This magic test requires explanation.
1481                                  *
1482                                  *   We know, that some child q is already
1483                                  *   attached to this parent and have choice:
1484                                  *   either to change it or to create/graft new one.
1485                                  *
1486                                  *   1. We are allowed to create/graft only
1487                                  *   if CREATE and REPLACE flags are set.
1488                                  *
1489                                  *   2. If EXCL is set, requestor wanted to say,
1490                                  *   that qdisc tcm_handle is not expected
1491                                  *   to exist, so that we choose create/graft too.
1492                                  *
1493                                  *   3. The last case is when no flags are set.
1494                                  *   Alas, it is sort of hole in API, we
1495                                  *   cannot decide what to do unambiguously.
1496                                  *   For now we select create/graft, if
1497                                  *   user gave KIND, which does not match existing.
1498                                  */
1499                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1500                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1501                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1502                                      (tca[TCA_KIND] &&
1503                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1504                                         goto create_n_graft;
1505                         }
1506                 }
1507         } else {
1508                 if (!tcm->tcm_handle) {
1509                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1510                         return -EINVAL;
1511                 }
1512                 q = qdisc_lookup(dev, tcm->tcm_handle);
1513         }
1514
1515         /* Change qdisc parameters */
1516         if (!q) {
1517                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1518                 return -ENOENT;
1519         }
1520         if (n->nlmsg_flags & NLM_F_EXCL) {
1521                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1522                 return -EEXIST;
1523         }
1524         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1525                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1526                 return -EINVAL;
1527         }
1528         err = qdisc_change(q, tca, extack);
1529         if (err == 0)
1530                 qdisc_notify(net, skb, n, clid, NULL, q);
1531         return err;
1532
1533 create_n_graft:
1534         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1535                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1536                 return -ENOENT;
1537         }
1538         if (clid == TC_H_INGRESS) {
1539                 if (dev_ingress_queue(dev)) {
1540                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1541                                          tcm->tcm_parent, tcm->tcm_parent,
1542                                          tca, &err, extack);
1543                 } else {
1544                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1545                         err = -ENOENT;
1546                 }
1547         } else {
1548                 struct netdev_queue *dev_queue;
1549
1550                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1551                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1552                 else if (p)
1553                         dev_queue = p->dev_queue;
1554                 else
1555                         dev_queue = netdev_get_tx_queue(dev, 0);
1556
1557                 q = qdisc_create(dev, dev_queue, p,
1558                                  tcm->tcm_parent, tcm->tcm_handle,
1559                                  tca, &err, extack);
1560         }
1561         if (q == NULL) {
1562                 if (err == -EAGAIN)
1563                         goto replay;
1564                 return err;
1565         }
1566
1567 graft:
1568         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1569         if (err) {
1570                 if (q)
1571                         qdisc_destroy(q);
1572                 return err;
1573         }
1574
1575         return 0;
1576 }
1577
1578 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1579                               struct netlink_callback *cb,
1580                               int *q_idx_p, int s_q_idx, bool recur,
1581                               bool dump_invisible)
1582 {
1583         int ret = 0, q_idx = *q_idx_p;
1584         struct Qdisc *q;
1585         int b;
1586
1587         if (!root)
1588                 return 0;
1589
1590         q = root;
1591         if (q_idx < s_q_idx) {
1592                 q_idx++;
1593         } else {
1594                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1595                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1596                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1597                                   RTM_NEWQDISC) <= 0)
1598                         goto done;
1599                 q_idx++;
1600         }
1601
1602         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1603          * itself has already been dumped.
1604          *
1605          * If we've already dumped the top-level (ingress) qdisc above and the global
1606          * qdisc hashtable, we don't want to hit it again
1607          */
1608         if (!qdisc_dev(root) || !recur)
1609                 goto out;
1610
1611         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1612                 if (q_idx < s_q_idx) {
1613                         q_idx++;
1614                         continue;
1615                 }
1616                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1617                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1618                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1619                                   RTM_NEWQDISC) <= 0)
1620                         goto done;
1621                 q_idx++;
1622         }
1623
1624 out:
1625         *q_idx_p = q_idx;
1626         return ret;
1627 done:
1628         ret = -1;
1629         goto out;
1630 }
1631
1632 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1633 {
1634         struct net *net = sock_net(skb->sk);
1635         int idx, q_idx;
1636         int s_idx, s_q_idx;
1637         struct net_device *dev;
1638         const struct nlmsghdr *nlh = cb->nlh;
1639         struct nlattr *tca[TCA_MAX + 1];
1640         int err;
1641
1642         s_idx = cb->args[0];
1643         s_q_idx = q_idx = cb->args[1];
1644
1645         idx = 0;
1646         ASSERT_RTNL();
1647
1648         err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX, NULL, NULL);
1649         if (err < 0)
1650                 return err;
1651
1652         for_each_netdev(net, dev) {
1653                 struct netdev_queue *dev_queue;
1654
1655                 if (idx < s_idx)
1656                         goto cont;
1657                 if (idx > s_idx)
1658                         s_q_idx = 0;
1659                 q_idx = 0;
1660
1661                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
1662                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1663                         goto done;
1664
1665                 dev_queue = dev_ingress_queue(dev);
1666                 if (dev_queue &&
1667                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1668                                        &q_idx, s_q_idx, false,
1669                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1670                         goto done;
1671
1672 cont:
1673                 idx++;
1674         }
1675
1676 done:
1677         cb->args[0] = idx;
1678         cb->args[1] = q_idx;
1679
1680         return skb->len;
1681 }
1682
1683
1684
1685 /************************************************
1686  *      Traffic classes manipulation.           *
1687  ************************************************/
1688
1689 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1690                           unsigned long cl,
1691                           u32 portid, u32 seq, u16 flags, int event)
1692 {
1693         struct tcmsg *tcm;
1694         struct nlmsghdr  *nlh;
1695         unsigned char *b = skb_tail_pointer(skb);
1696         struct gnet_dump d;
1697         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1698
1699         cond_resched();
1700         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1701         if (!nlh)
1702                 goto out_nlmsg_trim;
1703         tcm = nlmsg_data(nlh);
1704         tcm->tcm_family = AF_UNSPEC;
1705         tcm->tcm__pad1 = 0;
1706         tcm->tcm__pad2 = 0;
1707         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1708         tcm->tcm_parent = q->handle;
1709         tcm->tcm_handle = q->handle;
1710         tcm->tcm_info = 0;
1711         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1712                 goto nla_put_failure;
1713         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1714                 goto nla_put_failure;
1715
1716         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1717                                          NULL, &d, TCA_PAD) < 0)
1718                 goto nla_put_failure;
1719
1720         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1721                 goto nla_put_failure;
1722
1723         if (gnet_stats_finish_copy(&d) < 0)
1724                 goto nla_put_failure;
1725
1726         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1727         return skb->len;
1728
1729 out_nlmsg_trim:
1730 nla_put_failure:
1731         nlmsg_trim(skb, b);
1732         return -1;
1733 }
1734
1735 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1736                          struct nlmsghdr *n, struct Qdisc *q,
1737                          unsigned long cl, int event)
1738 {
1739         struct sk_buff *skb;
1740         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1741
1742         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1743         if (!skb)
1744                 return -ENOBUFS;
1745
1746         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1747                 kfree_skb(skb);
1748                 return -EINVAL;
1749         }
1750
1751         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1752                               n->nlmsg_flags & NLM_F_ECHO);
1753 }
1754
1755 static int tclass_del_notify(struct net *net,
1756                              const struct Qdisc_class_ops *cops,
1757                              struct sk_buff *oskb, struct nlmsghdr *n,
1758                              struct Qdisc *q, unsigned long cl)
1759 {
1760         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1761         struct sk_buff *skb;
1762         int err = 0;
1763
1764         if (!cops->delete)
1765                 return -EOPNOTSUPP;
1766
1767         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1768         if (!skb)
1769                 return -ENOBUFS;
1770
1771         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1772                            RTM_DELTCLASS) < 0) {
1773                 kfree_skb(skb);
1774                 return -EINVAL;
1775         }
1776
1777         err = cops->delete(q, cl);
1778         if (err) {
1779                 kfree_skb(skb);
1780                 return err;
1781         }
1782
1783         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1784                               n->nlmsg_flags & NLM_F_ECHO);
1785 }
1786
1787 #ifdef CONFIG_NET_CLS
1788
/* Walker state handed to tcf_node_bind() for every filter node visited. */
struct tcf_bind_args {
        /* Must stay the first member: tcf_node_bind() recovers this struct
         * from the struct tcf_walker pointer it is given.
         */
        struct tcf_walker w;
        /* Class id forwarded to the classifier's ->bind_class(). */
        u32 classid;
        /* Class reference forwarded to ->bind_class(). */
        unsigned long cl;
};
1794
1795 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1796 {
1797         struct tcf_bind_args *a = (void *)arg;
1798
1799         if (tp->ops->bind_class) {
1800                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1801
1802                 sch_tree_lock(q);
1803                 tp->ops->bind_class(n, a->classid, a->cl);
1804                 sch_tree_unlock(q);
1805         }
1806         return 0;
1807 }
1808
1809 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1810                            unsigned long new_cl)
1811 {
1812         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1813         struct tcf_block *block;
1814         struct tcf_chain *chain;
1815         unsigned long cl;
1816
1817         cl = cops->find(q, portid);
1818         if (!cl)
1819                 return;
1820         block = cops->tcf_block(q, cl, NULL);
1821         if (!block)
1822                 return;
1823         list_for_each_entry(chain, &block->chain_list, list) {
1824                 struct tcf_proto *tp;
1825
1826                 for (tp = rtnl_dereference(chain->filter_chain);
1827                      tp; tp = rtnl_dereference(tp->next)) {
1828                         struct tcf_bind_args arg = {};
1829
1830                         arg.w.fn = tcf_node_bind;
1831                         arg.classid = clid;
1832                         arg.cl = new_cl;
1833                         tp->ops->walk(tp, &arg.w);
1834                 }
1835         }
1836 }
1837
1838 #else
1839
/* Variant built when CONFIG_NET_CLS is not set: does nothing. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
1844
1845 #endif
1846
1847 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1848                          struct netlink_ext_ack *extack)
1849 {
1850         struct net *net = sock_net(skb->sk);
1851         struct tcmsg *tcm = nlmsg_data(n);
1852         struct nlattr *tca[TCA_MAX + 1];
1853         struct net_device *dev;
1854         struct Qdisc *q = NULL;
1855         const struct Qdisc_class_ops *cops;
1856         unsigned long cl = 0;
1857         unsigned long new_cl;
1858         u32 portid;
1859         u32 clid;
1860         u32 qid;
1861         int err;
1862
1863         if ((n->nlmsg_type != RTM_GETTCLASS) &&
1864             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1865                 return -EPERM;
1866
1867         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack);
1868         if (err < 0)
1869                 return err;
1870
1871         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1872         if (!dev)
1873                 return -ENODEV;
1874
1875         /*
1876            parent == TC_H_UNSPEC - unspecified parent.
1877            parent == TC_H_ROOT   - class is root, which has no parent.
1878            parent == X:0         - parent is root class.
1879            parent == X:Y         - parent is a node in hierarchy.
1880            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1881
1882            handle == 0:0         - generate handle from kernel pool.
1883            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1884            handle == X:Y         - clear.
1885            handle == X:0         - root class.
1886          */
1887
1888         /* Step 1. Determine qdisc handle X:0 */
1889
1890         portid = tcm->tcm_parent;
1891         clid = tcm->tcm_handle;
1892         qid = TC_H_MAJ(clid);
1893
1894         if (portid != TC_H_ROOT) {
1895                 u32 qid1 = TC_H_MAJ(portid);
1896
1897                 if (qid && qid1) {
1898                         /* If both majors are known, they must be identical. */
1899                         if (qid != qid1)
1900                                 return -EINVAL;
1901                 } else if (qid1) {
1902                         qid = qid1;
1903                 } else if (qid == 0)
1904                         qid = dev->qdisc->handle;
1905
1906                 /* Now qid is genuine qdisc handle consistent
1907                  * both with parent and child.
1908                  *
1909                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1910                  */
1911                 if (portid)
1912                         portid = TC_H_MAKE(qid, portid);
1913         } else {
1914                 if (qid == 0)
1915                         qid = dev->qdisc->handle;
1916         }
1917
1918         /* OK. Locate qdisc */
1919         q = qdisc_lookup(dev, qid);
1920         if (!q)
1921                 return -ENOENT;
1922
1923         /* An check that it supports classes */
1924         cops = q->ops->cl_ops;
1925         if (cops == NULL)
1926                 return -EINVAL;
1927
1928         /* Now try to get class */
1929         if (clid == 0) {
1930                 if (portid == TC_H_ROOT)
1931                         clid = qid;
1932         } else
1933                 clid = TC_H_MAKE(qid, clid);
1934
1935         if (clid)
1936                 cl = cops->find(q, clid);
1937
1938         if (cl == 0) {
1939                 err = -ENOENT;
1940                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1941                     !(n->nlmsg_flags & NLM_F_CREATE))
1942                         goto out;
1943         } else {
1944                 switch (n->nlmsg_type) {
1945                 case RTM_NEWTCLASS:
1946                         err = -EEXIST;
1947                         if (n->nlmsg_flags & NLM_F_EXCL)
1948                                 goto out;
1949                         break;
1950                 case RTM_DELTCLASS:
1951                         err = tclass_del_notify(net, cops, skb, n, q, cl);
1952                         /* Unbind the class with flilters with 0 */
1953                         tc_bind_tclass(q, portid, clid, 0);
1954                         goto out;
1955                 case RTM_GETTCLASS:
1956                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1957                         goto out;
1958                 default:
1959                         err = -EINVAL;
1960                         goto out;
1961                 }
1962         }
1963
1964         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1965                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
1966                 return -EOPNOTSUPP;
1967         }
1968
1969         new_cl = cl;
1970         err = -EOPNOTSUPP;
1971         if (cops->change)
1972                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
1973         if (err == 0) {
1974                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1975                 /* We just create a new class, need to do reverse binding. */
1976                 if (cl != new_cl)
1977                         tc_bind_tclass(q, portid, clid, new_cl);
1978         }
1979 out:
1980         return err;
1981 }
1982
/* Walker state used while dumping the classes of one qdisc. */
struct qdisc_dump_args {
        /* Must stay the first member: qdisc_class_dump() recovers this
         * struct from the struct qdisc_walker pointer it is given.
         */
        struct qdisc_walker     w;
        /* Message buffer the class entries are appended to. */
        struct sk_buff          *skb;
        /* Dump callback state; supplies the requester's port id and the
         * request's sequence number.
         */
        struct netlink_callback *cb;
};
1988
1989 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1990                             struct qdisc_walker *arg)
1991 {
1992         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1993
1994         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1995                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1996                               RTM_NEWTCLASS);
1997 }
1998
1999 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2000                                 struct tcmsg *tcm, struct netlink_callback *cb,
2001                                 int *t_p, int s_t)
2002 {
2003         struct qdisc_dump_args arg;
2004
2005         if (tc_qdisc_dump_ignore(q, false) ||
2006             *t_p < s_t || !q->ops->cl_ops ||
2007             (tcm->tcm_parent &&
2008              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2009                 (*t_p)++;
2010                 return 0;
2011         }
2012         if (*t_p > s_t)
2013                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2014         arg.w.fn = qdisc_class_dump;
2015         arg.skb = skb;
2016         arg.cb = cb;
2017         arg.w.stop  = 0;
2018         arg.w.skip = cb->args[1];
2019         arg.w.count = 0;
2020         q->ops->cl_ops->walk(q, &arg.w);
2021         cb->args[1] = arg.w.count;
2022         if (arg.w.stop)
2023                 return -1;
2024         (*t_p)++;
2025         return 0;
2026 }
2027
2028 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2029                                struct tcmsg *tcm, struct netlink_callback *cb,
2030                                int *t_p, int s_t)
2031 {
2032         struct Qdisc *q;
2033         int b;
2034
2035         if (!root)
2036                 return 0;
2037
2038         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2039                 return -1;
2040
2041         if (!qdisc_dev(root))
2042                 return 0;
2043
2044         if (tcm->tcm_parent) {
2045                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2046                 if (q && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2047                         return -1;
2048                 return 0;
2049         }
2050         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2051                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2052                         return -1;
2053         }
2054
2055         return 0;
2056 }
2057
2058 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2059 {
2060         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2061         struct net *net = sock_net(skb->sk);
2062         struct netdev_queue *dev_queue;
2063         struct net_device *dev;
2064         int t, s_t;
2065
2066         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2067                 return 0;
2068         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2069         if (!dev)
2070                 return 0;
2071
2072         s_t = cb->args[0];
2073         t = 0;
2074
2075         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
2076                 goto done;
2077
2078         dev_queue = dev_ingress_queue(dev);
2079         if (dev_queue &&
2080             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2081                                 &t, s_t) < 0)
2082                 goto done;
2083
2084 done:
2085         cb->args[0] = t;
2086
2087         dev_put(dev);
2088         return skb->len;
2089 }
2090
2091 #ifdef CONFIG_PROC_FS
2092 static int psched_show(struct seq_file *seq, void *v)
2093 {
2094         seq_printf(seq, "%08x %08x %08x %08x\n",
2095                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2096                    1000000,
2097                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2098
2099         return 0;
2100 }
2101
2102 static int __net_init psched_net_init(struct net *net)
2103 {
2104         struct proc_dir_entry *e;
2105
2106         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2107         if (e == NULL)
2108                 return -ENOMEM;
2109
2110         return 0;
2111 }
2112
/* Per-namespace exit: remove the "psched" entry from net->proc_net. */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2117 #else
/* Variant built without CONFIG_PROC_FS: nothing to create, always succeeds. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
2122
/* Variant built without CONFIG_PROC_FS: nothing to remove. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2126 #endif
2127
/* Per-network-namespace hooks, registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2132
2133 static int __init pktsched_init(void)
2134 {
2135         int err;
2136
2137         err = register_pernet_subsys(&psched_net_ops);
2138         if (err) {
2139                 pr_err("pktsched_init: "
2140                        "cannot initialize per netns operations\n");
2141                 return err;
2142         }
2143
2144         register_qdisc(&pfifo_fast_ops);
2145         register_qdisc(&pfifo_qdisc_ops);
2146         register_qdisc(&bfifo_qdisc_ops);
2147         register_qdisc(&pfifo_head_drop_qdisc_ops);
2148         register_qdisc(&mq_qdisc_ops);
2149         register_qdisc(&noqueue_qdisc_ops);
2150
2151         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2152         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2153         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2154                       0);
2155         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2156         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2157         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2158                       0);
2159
2160         return 0;
2161 }
2162
2163 subsys_initcall(pktsched_init);