Merge branch 'aux-bus-v11' of https://github.com/ajitkhaparde1/linux
[linux-block.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
53    Qdiscs are divided into two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets into "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60
61    The goal of the routines in this file is to translate
62    information supplied by the user in the form of handles
63    into a form more intelligible to the kernel, to perform sanity
64    checks and the part of the work that is common to all qdiscs,
65    and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns an skb to send. It is allowed to return NULL,
76    but that does not mean that the queue is empty; it just means that the
77    discipline does not want to send anything this time.
78    The queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues, q->q is not
80    a real packet queue; however, q->q.qlen must still be valid.
81
82    ---enqueue
83
84    enqueue returns 0 if the packet was enqueued successfully.
85    If a packet (this one or another one) was dropped, it returns
86    a non-zero error code.
87    NET_XMIT_DROP        - this packet dropped
88      Expected action: do not back off, but wait until the queue clears.
89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
90      Expected action: back off or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
116 /* Protects the list of registered qdisc ops (qdisc_base) and default_qdisc_ops; a pure SMP lock. */
117 static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
125 /* Head of the list (singly linked through ->next) of all installed queueing disciplines. */
126
127 static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: install the default qdisc selected in the kernel config. */
static int __init sch_default_qdisc(void)
{
	int err = qdisc_set_default(CONFIG_DEFAULT_NET_SCH);

	return err;
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
332 out:
333         return q;
334 }
335
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338         unsigned long cl;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         return cops->leaf(p, cl);
348 }
349
350 /* Find queueing discipline by name */
351
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354         struct Qdisc_ops *q = NULL;
355
356         if (kind) {
357                 read_lock(&qdisc_mod_lock);
358                 for (q = qdisc_base; q; q = q->next) {
359                         if (nla_strcmp(kind, q->id) == 0) {
360                                 if (!try_module_get(q->owner))
361                                         q = NULL;
362                                 break;
363                         }
364                 }
365                 read_unlock(&qdisc_mod_lock);
366         }
367         return q;
368 }
369
370 /* The linklayer setting were not transferred from iproute2, in older
371  * versions, and the rate tables lookup systems have been dropped in
372  * the kernel. To keep backward compatible with older iproute2 tc
373  * utils, we detect the linklayer setting by detecting if the rate
374  * table were modified.
375  *
376  * For linklayer ATM table entries, the rate table will be aligned to
377  * 48 bytes, thus some table entries will contain the same value.  The
378  * mpu (min packet unit) is also encoded into the old rate table, thus
379  * starting from the mpu, we find low and high table entries for
380  * mapping this cell.  If these entries contain the same value, when
381  * the rate tables have been modified for linklayer ATM.
382  *
383  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384  * and then roundup to the next cell, calc the table entry one below,
385  * and compare.
386  */
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
388 {
389         int low       = roundup(r->mpu, 48);
390         int high      = roundup(low+1, 48);
391         int cell_low  = low >> r->cell_log;
392         int cell_high = (high >> r->cell_log) - 1;
393
394         /* rtab is too inaccurate at rates > 100Mbit/s */
395         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396                 pr_debug("TC linklayer: Giving up ATM detection\n");
397                 return TC_LINKLAYER_ETHERNET;
398         }
399
400         if ((cell_high > cell_low) && (cell_high < 256)
401             && (rtab[cell_low] == rtab[cell_high])) {
402                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403                          cell_low, cell_high, rtab[cell_high]);
404                 return TC_LINKLAYER_ATM;
405         }
406         return TC_LINKLAYER_ETHERNET;
407 }
408
409 static struct qdisc_rate_table *qdisc_rtab_list;
410
411 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
412                                         struct nlattr *tab,
413                                         struct netlink_ext_ack *extack)
414 {
415         struct qdisc_rate_table *rtab;
416
417         if (tab == NULL || r->rate == 0 ||
418             r->cell_log == 0 || r->cell_log >= 32 ||
419             nla_len(tab) != TC_RTAB_SIZE) {
420                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421                 return NULL;
422         }
423
424         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
427                         rtab->refcnt++;
428                         return rtab;
429                 }
430         }
431
432         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433         if (rtab) {
434                 rtab->rate = *r;
435                 rtab->refcnt = 1;
436                 memcpy(rtab->data, nla_data(tab), 1024);
437                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438                         r->linklayer = __detect_linklayer(r, rtab->data);
439                 rtab->next = qdisc_rtab_list;
440                 qdisc_rtab_list = rtab;
441         } else {
442                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443         }
444         return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
467 static LIST_HEAD(qdisc_stab_list);
468
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
471         [TCA_STAB_DATA] = { .type = NLA_BINARY },
472 };
473
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475                                                struct netlink_ext_ack *extack)
476 {
477         struct nlattr *tb[TCA_STAB_MAX + 1];
478         struct qdisc_size_table *stab;
479         struct tc_sizespec *s;
480         unsigned int tsize = 0;
481         u16 *tab = NULL;
482         int err;
483
484         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
485                                           extack);
486         if (err < 0)
487                 return ERR_PTR(err);
488         if (!tb[TCA_STAB_BASE]) {
489                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
490                 return ERR_PTR(-EINVAL);
491         }
492
493         s = nla_data(tb[TCA_STAB_BASE]);
494
495         if (s->tsize > 0) {
496                 if (!tb[TCA_STAB_DATA]) {
497                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
498                         return ERR_PTR(-EINVAL);
499                 }
500                 tab = nla_data(tb[TCA_STAB_DATA]);
501                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
502         }
503
504         if (tsize != s->tsize || (!tab && tsize > 0)) {
505                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
506                 return ERR_PTR(-EINVAL);
507         }
508
509         list_for_each_entry(stab, &qdisc_stab_list, list) {
510                 if (memcmp(&stab->szopts, s, sizeof(*s)))
511                         continue;
512                 if (tsize > 0 &&
513                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
514                         continue;
515                 stab->refcnt++;
516                 return stab;
517         }
518
519         if (s->size_log > STAB_SIZE_LOG_MAX ||
520             s->cell_log > STAB_SIZE_LOG_MAX) {
521                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522                 return ERR_PTR(-EINVAL);
523         }
524
525         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
526         if (!stab)
527                 return ERR_PTR(-ENOMEM);
528
529         stab->refcnt = 1;
530         stab->szopts = *s;
531         if (tsize > 0)
532                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
533
534         list_add_tail(&stab->list, &qdisc_stab_list);
535
536         return stab;
537 }
538
539 void qdisc_put_stab(struct qdisc_size_table *tab)
540 {
541         if (!tab)
542                 return;
543
544         if (--tab->refcnt == 0) {
545                 list_del(&tab->list);
546                 kfree_rcu(tab, rcu);
547         }
548 }
549 EXPORT_SYMBOL(qdisc_put_stab);
550
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
552 {
553         struct nlattr *nest;
554
555         nest = nla_nest_start_noflag(skb, TCA_STAB);
556         if (nest == NULL)
557                 goto nla_put_failure;
558         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559                 goto nla_put_failure;
560         nla_nest_end(skb, nest);
561
562         return skb->len;
563
564 nla_put_failure:
565         return -1;
566 }
567
568 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
569                                const struct qdisc_size_table *stab)
570 {
571         int pkt_len, slot;
572
573         pkt_len = skb->len + stab->szopts.overhead;
574         if (unlikely(!stab->szopts.tsize))
575                 goto out;
576
577         slot = pkt_len + stab->szopts.cell_align;
578         if (unlikely(slot < 0))
579                 slot = 0;
580
581         slot >>= stab->szopts.cell_log;
582         if (likely(slot < stab->szopts.tsize))
583                 pkt_len = stab->data[slot];
584         else
585                 pkt_len = stab->data[stab->szopts.tsize - 1] *
586                                 (slot / stab->szopts.tsize) +
587                                 stab->data[slot % stab->szopts.tsize];
588
589         pkt_len <<= stab->szopts.size_log;
590 out:
591         if (unlikely(pkt_len < 1))
592                 pkt_len = 1;
593         qdisc_skb_cb(skb)->pkt_len = pkt_len;
594 }
595 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
596
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601                         txt, qdisc->ops->id, qdisc->handle >> 16);
602                 qdisc->flags |= TCQ_F_WARN_NONWC;
603         }
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610                                                  timer);
611
612         rcu_read_lock();
613         __netif_schedule(qdisc_root(wd->qdisc));
614         rcu_read_unlock();
615
616         return HRTIMER_NORESTART;
617 }
618
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620                                  clockid_t clockid)
621 {
622         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623         wd->timer.function = qdisc_watchdog;
624         wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627
628 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
629 {
630         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
631 }
632 EXPORT_SYMBOL(qdisc_watchdog_init);
633
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635                                       u64 delta_ns)
636 {
637         if (test_bit(__QDISC_STATE_DEACTIVATED,
638                      &qdisc_root_sleeping(wd->qdisc)->state))
639                 return;
640
641         if (hrtimer_is_queued(&wd->timer)) {
642                 /* If timer is already set in [expires, expires + delta_ns],
643                  * do not reprogram it.
644                  */
645                 if (wd->last_expires - expires <= delta_ns)
646                         return;
647         }
648
649         wd->last_expires = expires;
650         hrtimer_start_range_ns(&wd->timer,
651                                ns_to_ktime(expires),
652                                delta_ns,
653                                HRTIMER_MODE_ABS_PINNED);
654 }
655 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
656
657 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
658 {
659         hrtimer_cancel(&wd->timer);
660 }
661 EXPORT_SYMBOL(qdisc_watchdog_cancel);
662
663 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
664 {
665         struct hlist_head *h;
666         unsigned int i;
667
668         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
669
670         if (h != NULL) {
671                 for (i = 0; i < n; i++)
672                         INIT_HLIST_HEAD(&h[i]);
673         }
674         return h;
675 }
676
677 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
678 {
679         struct Qdisc_class_common *cl;
680         struct hlist_node *next;
681         struct hlist_head *nhash, *ohash;
682         unsigned int nsize, nmask, osize;
683         unsigned int i, h;
684
685         /* Rehash when load factor exceeds 0.75 */
686         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
687                 return;
688         nsize = clhash->hashsize * 2;
689         nmask = nsize - 1;
690         nhash = qdisc_class_hash_alloc(nsize);
691         if (nhash == NULL)
692                 return;
693
694         ohash = clhash->hash;
695         osize = clhash->hashsize;
696
697         sch_tree_lock(sch);
698         for (i = 0; i < osize; i++) {
699                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
700                         h = qdisc_class_hash(cl->classid, nmask);
701                         hlist_add_head(&cl->hnode, &nhash[h]);
702                 }
703         }
704         clhash->hash     = nhash;
705         clhash->hashsize = nsize;
706         clhash->hashmask = nmask;
707         sch_tree_unlock(sch);
708
709         kvfree(ohash);
710 }
711 EXPORT_SYMBOL(qdisc_class_hash_grow);
712
713 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
714 {
715         unsigned int size = 4;
716
717         clhash->hash = qdisc_class_hash_alloc(size);
718         if (!clhash->hash)
719                 return -ENOMEM;
720         clhash->hashsize  = size;
721         clhash->hashmask  = size - 1;
722         clhash->hashelems = 0;
723         return 0;
724 }
725 EXPORT_SYMBOL(qdisc_class_hash_init);
726
727 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
728 {
729         kvfree(clhash->hash);
730 }
731 EXPORT_SYMBOL(qdisc_class_hash_destroy);
732
733 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
734                              struct Qdisc_class_common *cl)
735 {
736         unsigned int h;
737
738         INIT_HLIST_NODE(&cl->hnode);
739         h = qdisc_class_hash(cl->classid, clhash->hashmask);
740         hlist_add_head(&cl->hnode, &clhash->hash[h]);
741         clhash->hashelems++;
742 }
743 EXPORT_SYMBOL(qdisc_class_hash_insert);
744
745 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
746                              struct Qdisc_class_common *cl)
747 {
748         hlist_del(&cl->hnode);
749         clhash->hashelems--;
750 }
751 EXPORT_SYMBOL(qdisc_class_hash_remove);
752
753 /* Allocate an unique handle from space managed by kernel
754  * Possible range is [8000-FFFF]:0000 (0x8000 values)
755  */
756 static u32 qdisc_alloc_handle(struct net_device *dev)
757 {
758         int i = 0x8000;
759         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
760
761         do {
762                 autohandle += TC_H_MAKE(0x10000U, 0);
763                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
764                         autohandle = TC_H_MAKE(0x80000000U, 0);
765                 if (!qdisc_lookup(dev, autohandle))
766                         return autohandle;
767                 cond_resched();
768         } while (--i > 0);
769
770         return 0;
771 }
772
773 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
774 {
775         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
776         const struct Qdisc_class_ops *cops;
777         unsigned long cl;
778         u32 parentid;
779         bool notify;
780         int drops;
781
782         if (n == 0 && len == 0)
783                 return;
784         drops = max_t(int, n, 0);
785         rcu_read_lock();
786         while ((parentid = sch->parent)) {
787                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
788                         break;
789
790                 if (sch->flags & TCQ_F_NOPARENT)
791                         break;
792                 /* Notify parent qdisc only if child qdisc becomes empty.
793                  *
794                  * If child was empty even before update then backlog
795                  * counter is screwed and we skip notification because
796                  * parent class is already passive.
797                  *
798                  * If the original child was offloaded then it is allowed
799                  * to be seem as empty, so the parent is notified anyway.
800                  */
801                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
802                                                        !qdisc_is_offloaded);
803                 /* TODO: perform the search on a per txq basis */
804                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
805                 if (sch == NULL) {
806                         WARN_ON_ONCE(parentid != TC_H_ROOT);
807                         break;
808                 }
809                 cops = sch->ops->cl_ops;
810                 if (notify && cops->qlen_notify) {
811                         cl = cops->find(sch, parentid);
812                         cops->qlen_notify(sch, cl);
813                 }
814                 sch->q.qlen -= n;
815                 sch->qstats.backlog -= len;
816                 __qdisc_qstats_drop(sch, drops);
817         }
818         rcu_read_unlock();
819 }
820 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
821
822 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
823                               void *type_data)
824 {
825         struct net_device *dev = qdisc_dev(sch);
826         int err;
827
828         sch->flags &= ~TCQ_F_OFFLOADED;
829         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
830                 return 0;
831
832         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
833         if (err == -EOPNOTSUPP)
834                 return 0;
835
836         if (!err)
837                 sch->flags |= TCQ_F_OFFLOADED;
838
839         return err;
840 }
841 EXPORT_SYMBOL(qdisc_offload_dump_helper);
842
843 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
844                                 struct Qdisc *new, struct Qdisc *old,
845                                 enum tc_setup_type type, void *type_data,
846                                 struct netlink_ext_ack *extack)
847 {
848         bool any_qdisc_is_offloaded;
849         int err;
850
851         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
852                 return;
853
854         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
855
856         /* Don't report error if the graft is part of destroy operation. */
857         if (!err || !new || new == &noop_qdisc)
858                 return;
859
860         /* Don't report error if the parent, the old child and the new
861          * one are not offloaded.
862          */
863         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
864         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
865         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
866
867         if (any_qdisc_is_offloaded)
868                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
869 }
870 EXPORT_SYMBOL(qdisc_offload_graft_helper);
871
872 void qdisc_offload_query_caps(struct net_device *dev,
873                               enum tc_setup_type type,
874                               void *caps, size_t caps_len)
875 {
876         const struct net_device_ops *ops = dev->netdev_ops;
877         struct tc_query_caps_base base = {
878                 .type = type,
879                 .caps = caps,
880         };
881
882         memset(caps, 0, caps_len);
883
884         if (ops->ndo_setup_tc)
885                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
886 }
887 EXPORT_SYMBOL(qdisc_offload_query_caps);
888
889 static void qdisc_offload_graft_root(struct net_device *dev,
890                                      struct Qdisc *new, struct Qdisc *old,
891                                      struct netlink_ext_ack *extack)
892 {
893         struct tc_root_qopt_offload graft_offload = {
894                 .command        = TC_ROOT_GRAFT,
895                 .handle         = new ? new->handle : 0,
896                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
897                                   (old && old->flags & TCQ_F_INGRESS),
898         };
899
900         qdisc_offload_graft_helper(dev, NULL, new, old,
901                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
902 }
903
904 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
905                          u32 portid, u32 seq, u16 flags, int event,
906                          struct netlink_ext_ack *extack)
907 {
908         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
909         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
910         struct tcmsg *tcm;
911         struct nlmsghdr  *nlh;
912         unsigned char *b = skb_tail_pointer(skb);
913         struct gnet_dump d;
914         struct qdisc_size_table *stab;
915         u32 block_index;
916         __u32 qlen;
917
918         cond_resched();
919         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
920         if (!nlh)
921                 goto out_nlmsg_trim;
922         tcm = nlmsg_data(nlh);
923         tcm->tcm_family = AF_UNSPEC;
924         tcm->tcm__pad1 = 0;
925         tcm->tcm__pad2 = 0;
926         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
927         tcm->tcm_parent = clid;
928         tcm->tcm_handle = q->handle;
929         tcm->tcm_info = refcount_read(&q->refcnt);
930         if (nla_put_string(skb, TCA_KIND, q->ops->id))
931                 goto nla_put_failure;
932         if (q->ops->ingress_block_get) {
933                 block_index = q->ops->ingress_block_get(q);
934                 if (block_index &&
935                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
936                         goto nla_put_failure;
937         }
938         if (q->ops->egress_block_get) {
939                 block_index = q->ops->egress_block_get(q);
940                 if (block_index &&
941                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
942                         goto nla_put_failure;
943         }
944         if (q->ops->dump && q->ops->dump(q, skb) < 0)
945                 goto nla_put_failure;
946         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
947                 goto nla_put_failure;
948         qlen = qdisc_qlen_sum(q);
949
950         stab = rtnl_dereference(q->stab);
951         if (stab && qdisc_dump_stab(skb, stab) < 0)
952                 goto nla_put_failure;
953
954         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
955                                          NULL, &d, TCA_PAD) < 0)
956                 goto nla_put_failure;
957
958         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
959                 goto nla_put_failure;
960
961         if (qdisc_is_percpu_stats(q)) {
962                 cpu_bstats = q->cpu_bstats;
963                 cpu_qstats = q->cpu_qstats;
964         }
965
966         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
967             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
968             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
969                 goto nla_put_failure;
970
971         if (gnet_stats_finish_copy(&d) < 0)
972                 goto nla_put_failure;
973
974         if (extack && extack->_msg &&
975             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
976                 goto out_nlmsg_trim;
977
978         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
979
980         return skb->len;
981
982 out_nlmsg_trim:
983 nla_put_failure:
984         nlmsg_trim(skb, b);
985         return -1;
986 }
987
988 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
989 {
990         if (q->flags & TCQ_F_BUILTIN)
991                 return true;
992         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
993                 return true;
994
995         return false;
996 }
997
998 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
999                         struct nlmsghdr *n, u32 clid,
1000                         struct Qdisc *old, struct Qdisc *new,
1001                         struct netlink_ext_ack *extack)
1002 {
1003         struct sk_buff *skb;
1004         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1005
1006         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1007         if (!skb)
1008                 return -ENOBUFS;
1009
1010         if (old && !tc_qdisc_dump_ignore(old, false)) {
1011                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1012                                   0, RTM_DELQDISC, extack) < 0)
1013                         goto err_out;
1014         }
1015         if (new && !tc_qdisc_dump_ignore(new, false)) {
1016                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1017                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1018                         goto err_out;
1019         }
1020
1021         if (skb->len)
1022                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1023                                       n->nlmsg_flags & NLM_F_ECHO);
1024
1025 err_out:
1026         kfree_skb(skb);
1027         return -EINVAL;
1028 }
1029
1030 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1031                                struct nlmsghdr *n, u32 clid,
1032                                struct Qdisc *old, struct Qdisc *new,
1033                                struct netlink_ext_ack *extack)
1034 {
1035         if (new || old)
1036                 qdisc_notify(net, skb, n, clid, old, new, extack);
1037
1038         if (old)
1039                 qdisc_put(old);
1040 }
1041
1042 static void qdisc_clear_nolock(struct Qdisc *sch)
1043 {
1044         sch->flags &= ~TCQ_F_NOLOCK;
1045         if (!(sch->flags & TCQ_F_CPUSTATS))
1046                 return;
1047
1048         free_percpu(sch->cpu_bstats);
1049         free_percpu(sch->cpu_qstats);
1050         sch->cpu_bstats = NULL;
1051         sch->cpu_qstats = NULL;
1052         sch->flags &= ~TCQ_F_CPUSTATS;
1053 }
1054
1055 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
      * to device "dev" (the latter when "parent" is NULL).
      *
      * When appropriate send a netlink notification using 'skb'
      * and "n".
      *
      * On success, the old qdisc is released via notify_and_destroy().
      *
      * "new" may be NULL (nothing is attached; at device level dev->qdisc
      * then falls back to noop_qdisc).  "old" is only consulted on entry to
      * detect an ingress graft and for the root offload request; the qdisc
      * that is actually displaced is determined inside this function.
      *
      * Returns 0 on success or a negative errno:
      *   -ENOENT      ingress graft requested but the device has no ingress
      *                queue, or "classid" was not found in "parent";
      *   -EOPNOTSUPP  "parent" has no class ops or no ->graft() callback;
      *   -EINVAL      attempt to attach a noqueue qdisc to a class;
      *   otherwise whatever the parent's ->graft() callback returned.
      */
1063
1064 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1065                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1066                        struct Qdisc *new, struct Qdisc *old,
1067                        struct netlink_ext_ack *extack)
1068 {
1069         struct Qdisc *q = old;
1070         struct net *net = dev_net(dev);
1071
1072         if (parent == NULL) {
                     /* Device-level graft: either the egress root or ingress. */
1073                 unsigned int i, num_q, ingress;
1074
1075                 ingress = 0;
1076                 num_q = dev->num_tx_queues;
                     /* Ingress if either the old or the new qdisc is flagged so;
                      * in that case exactly one queue (the ingress queue) is
                      * touched and it must already exist.
                      */
1077                 if ((q && q->flags & TCQ_F_INGRESS) ||
1078                     (new && new->flags & TCQ_F_INGRESS)) {
1079                         num_q = 1;
1080                         ingress = 1;
1081                         if (!dev_ingress_queue(dev)) {
1082                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1083                                 return -ENOENT;
1084                         }
1085                 }
1086
                     /* A device that is up is deactivated for the duration of
                      * the swap and re-activated at the end of this branch.
                      */
1087                 if (dev->flags & IFF_UP)
1088                         dev_deactivate(dev);
1089
1090                 qdisc_offload_graft_root(dev, new, old, extack);
1091
                     /* An egress qdisc that provides ->attach() is not grafted
                      * onto the individual TX queues here; its ->attach() is
                      * called below, after dev->qdisc has been updated.
                      */
1092                 if (new && new->ops->attach && !ingress)
1093                         goto skip;
1094
1095                 for (i = 0; i < num_q; i++) {
                             /* Defaults to the ingress queue; replaced by TX
                              * queue i for an egress graft.
                              */
1096                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1097
1098                         if (!ingress)
1099                                 dev_queue = netdev_get_tx_queue(dev, i);
1100
1101                         old = dev_graft_qdisc(dev_queue, new);
                             /* One extra reference on "new" for every queue
                              * after the first.
                              */
1102                         if (new && i > 0)
1103                                 qdisc_refcount_inc(new);
1104
                             /* Egress: release the qdisc this queue had.  For
                              * ingress, "old" is kept and handed to
                              * notify_and_destroy() below instead.
                              */
1105                         if (!ingress)
1106                                 qdisc_put(old);
1107                 }
1108
1109 skip:
1110                 if (!ingress) {
                             /* Egress: the qdisc reported as replaced (and
                              * released) is the previous dev->qdisc.
                              */
1111                         old = rtnl_dereference(dev->qdisc);
                             /* dev->qdisc gets its own reference on "new",
                              * except when "new" has an ->attach() callback.
                              */
1112                         if (new && !new->ops->attach)
1113                                 qdisc_refcount_inc(new);
1114                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1115
1116                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1117
1118                         if (new && new->ops->attach)
1119                                 new->ops->attach(new);
1120                 } else {
                             /* Ingress: "old" is what dev_graft_qdisc()
                              * returned for the ingress queue.
                              */
1121                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1122                 }
1123
1124                 if (dev->flags & IFF_UP)
1125                         dev_activate(dev);
1126         } else {
                     /* Class-level graft: delegate to the parent's class ops. */
1127                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1128                 unsigned long cl;
1129                 int err;
1130
1131                 /* Only support running class lockless if parent is lockless */
                     /* NOTE: this runs before the validity checks below, so
                      * "new" is modified even when an error is returned.
                      */
1132                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1133                         qdisc_clear_nolock(new);
1134
1135                 if (!cops || !cops->graft)
1136                         return -EOPNOTSUPP;
1137
1138                 cl = cops->find(parent, classid);
1139                 if (!cl) {
1140                         NL_SET_ERR_MSG(extack, "Specified class not found");
1141                         return -ENOENT;
1142                 }
1143
1144                 if (new && new->ops == &noqueue_qdisc_ops) {
1145                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1146                         return -EINVAL;
1147                 }
1148
                     /* ->graft() reports the displaced qdisc through &old. */
1149                 err = cops->graft(parent, cl, new, &old, extack);
1150                 if (err)
1151                         return err;
1152                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1153         }
1154         return 0;
1155 }
1156
1157 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1158                                    struct netlink_ext_ack *extack)
1159 {
1160         u32 block_index;
1161
1162         if (tca[TCA_INGRESS_BLOCK]) {
1163                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1164
1165                 if (!block_index) {
1166                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1167                         return -EINVAL;
1168                 }
1169                 if (!sch->ops->ingress_block_set) {
1170                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1171                         return -EOPNOTSUPP;
1172                 }
1173                 sch->ops->ingress_block_set(sch, block_index);
1174         }
1175         if (tca[TCA_EGRESS_BLOCK]) {
1176                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1177
1178                 if (!block_index) {
1179                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1180                         return -EINVAL;
1181                 }
1182                 if (!sch->ops->egress_block_set) {
1183                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1184                         return -EOPNOTSUPP;
1185                 }
1186                 sch->ops->egress_block_set(sch, block_index);
1187         }
1188         return 0;
1189 }
1190
1191 /*
      * Allocate and initialize a new qdisc of the kind named by tca[TCA_KIND].
      *
      * dev, dev_queue: device and queue the qdisc is allocated for.
      * parent:         stored in sch->parent.
      * handle:         requested handle; TC_H_INGRESS marks an ingress qdisc,
      *                 0 asks for a handle to be allocated.
      * tca:            parsed attributes (TCA_KIND, TCA_OPTIONS, TCA_STAB,
      *                 TCA_RATE and the block indexes are used here).
      * errp:           receives the negative errno when NULL is returned.
      *
      * Returns the new qdisc, or NULL with *errp set.  *errp == -EAGAIN means a
      * module providing the kind was found after RTNL had been dropped and
      * re-taken, and the caller has to replay the whole request.
      */
1196
1197 static struct Qdisc *qdisc_create(struct net_device *dev,
1198                                   struct netdev_queue *dev_queue,
1199                                   u32 parent, u32 handle,
1200                                   struct nlattr **tca, int *errp,
1201                                   struct netlink_ext_ack *extack)
1202 {
1203         int err;
1204         struct nlattr *kind = tca[TCA_KIND];
1205         struct Qdisc *sch;
1206         struct Qdisc_ops *ops;
1207         struct qdisc_size_table *stab;
1208
             /* NOTE(review): the module_put() calls below imply that a successful
              * qdisc_lookup_ops() holds a reference on ops->owner - confirm.
              */
1209         ops = qdisc_lookup_ops(kind);
1210 #ifdef CONFIG_MODULES
1211         if (ops == NULL && kind != NULL) {
1212                 char name[IFNAMSIZ];
1213                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1214                         /* We dropped the RTNL semaphore in order to
1215                          * perform the module load.  So, even if we
1216                          * succeeded in loading the module we have to
1217                          * tell the caller to replay the request.  We
1218                          * indicate this using -EAGAIN.
1219                          * We replay the request because the device may
1220                          * go away in the mean time.
1221                          */
1222                         rtnl_unlock();
1223                         request_module("sch_%s", name);
1224                         rtnl_lock();
1225                         ops = qdisc_lookup_ops(kind);
1226                         if (ops != NULL) {
1227                                 /* We will try again qdisc_lookup_ops,
1228                                  * so don't keep a reference.
1229                                  */
1230                                 module_put(ops->owner);
1231                                 err = -EAGAIN;
1232                                 goto err_out;
1233                         }
1234                 }
1235         }
1236 #endif
1237
1238         err = -ENOENT;
1239         if (!ops) {
1240                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1241                 goto err_out;
1242         }
1243
1244         sch = qdisc_alloc(dev_queue, ops, extack);
1245         if (IS_ERR(sch)) {
1246                 err = PTR_ERR(sch);
1247                 goto err_out2;
1248         }
1249
1250         sch->parent = parent;
1251
             /* Ingress qdiscs get a fixed handle; everything else either uses
              * the requested handle or has one allocated when 0 was passed.
              */
1252         if (handle == TC_H_INGRESS) {
1253                 sch->flags |= TCQ_F_INGRESS;
1254                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1255         } else {
1256                 if (handle == 0) {
1257                         handle = qdisc_alloc_handle(dev);
1258                         if (handle == 0) {
1259                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1260                                 err = -ENOSPC;
1261                                 goto err_out3;
1262                         }
1263                 }
1264                 if (!netif_is_multiqueue(dev))
1265                         sch->flags |= TCQ_F_ONETXQUEUE;
1266         }
1267
1268         sch->handle = handle;
1269
1270         /* This exist to keep backward compatible with a userspace
1271          * loophole, what allowed userspace to get IFF_NO_QUEUE
1272          * facility on older kernels by setting tx_queue_len=0 (prior
1273          * to qdisc init), and then forgot to reinit tx_queue_len
1274          * before again attaching a qdisc.
1275          */
1276         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1277                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1278                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1279         }
1280
             /* Block indexes are applied before ->init() runs. */
1281         err = qdisc_block_indexes_set(sch, tca, extack);
1282         if (err)
1283                 goto err_out3;
1284
1285         if (ops->init) {
1286                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1287                 if (err != 0)
1288                         goto err_out5;
1289         }
1290
1291         if (tca[TCA_STAB]) {
1292                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1293                 if (IS_ERR(stab)) {
1294                         err = PTR_ERR(stab);
1295                         goto err_out4;
1296                 }
1297                 rcu_assign_pointer(sch->stab, stab);
1298         }
1299         if (tca[TCA_RATE]) {
                     /* A rate estimator is refused on TCQ_F_MQROOT qdiscs. */
1300                 err = -EOPNOTSUPP;
1301                 if (sch->flags & TCQ_F_MQROOT) {
1302                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1303                         goto err_out4;
1304                 }
1305
1306                 err = gen_new_estimator(&sch->bstats,
1307                                         sch->cpu_bstats,
1308                                         &sch->rate_est,
1309                                         NULL,
1310                                         true,
1311                                         tca[TCA_RATE]);
1312                 if (err) {
1313                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1314                         goto err_out4;
1315                 }
1316         }
1317
1318         qdisc_hash_add(sch, false);
1319         trace_qdisc_create(ops, dev, parent);
1320
1321         return sch;
1322
     /*
      * Error unwinding.  The labels fall through from top to bottom:
      *   err_out5  ->init() failed: call ->destroy(), then continue;
      *   err_out3  release the device tracker and free the qdisc;
      *   err_out2  drop the module reference on ops;
      *   err_out   report the error through *errp.
      * err_out4 (below the return) handles failures after ->init() succeeded:
      * it drops the size table, calls ->destroy() and joins at err_out3.
      */
1323 err_out5:
1324         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1325         if (ops->destroy)
1326                 ops->destroy(sch);
1327 err_out3:
1328         netdev_put(dev, &sch->dev_tracker);
1329         qdisc_free(sch);
1330 err_out2:
1331         module_put(ops->owner);
1332 err_out:
1333         *errp = err;
1334         return NULL;
1335
1336 err_out4:
1337         /*
1338          * Any broken qdiscs that would require a ops->reset() here?
1339          * The qdisc was never in action so it shouldn't be necessary.
1340          */
1341         qdisc_put_stab(rtnl_dereference(sch->stab));
1342         if (ops->destroy)
1343                 ops->destroy(sch);
1344         goto err_out3;
1345 }
1346
1347 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1348                         struct netlink_ext_ack *extack)
1349 {
1350         struct qdisc_size_table *ostab, *stab = NULL;
1351         int err = 0;
1352
1353         if (tca[TCA_OPTIONS]) {
1354                 if (!sch->ops->change) {
1355                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1356                         return -EINVAL;
1357                 }
1358                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1359                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1360                         return -EOPNOTSUPP;
1361                 }
1362                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1363                 if (err)
1364                         return err;
1365         }
1366
1367         if (tca[TCA_STAB]) {
1368                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1369                 if (IS_ERR(stab))
1370                         return PTR_ERR(stab);
1371         }
1372
1373         ostab = rtnl_dereference(sch->stab);
1374         rcu_assign_pointer(sch->stab, stab);
1375         qdisc_put_stab(ostab);
1376
1377         if (tca[TCA_RATE]) {
1378                 /* NB: ignores errors from replace_estimator
1379                    because change can't be undone. */
1380                 if (sch->flags & TCQ_F_MQROOT)
1381                         goto out;
1382                 gen_replace_estimator(&sch->bstats,
1383                                       sch->cpu_bstats,
1384                                       &sch->rate_est,
1385                                       NULL,
1386                                       true,
1387                                       tca[TCA_RATE]);
1388         }
1389 out:
1390         return 0;
1391 }
1392
/*
 * Walker context used by check_loop() / check_loop_fn().
 *
 * "w" must remain the first member: check_loop_fn() receives a pointer to it
 * and casts that pointer back to struct check_loop_arg.
 */
1393 struct check_loop_arg {
1394         struct qdisc_walker     w;      /* walker state handed to ->walk() */
1395         struct Qdisc            *p;     /* qdisc that must not appear below the one being walked */
1396         int                     depth;  /* current recursion depth; more than 7 is reported as a loop */
1397 };
1398
1399 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1400                          struct qdisc_walker *w);
1401
1402 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1403 {
1404         struct check_loop_arg   arg;
1405
1406         if (q->ops->cl_ops == NULL)
1407                 return 0;
1408
1409         arg.w.stop = arg.w.skip = arg.w.count = 0;
1410         arg.w.fn = check_loop_fn;
1411         arg.depth = depth;
1412         arg.p = p;
1413         q->ops->cl_ops->walk(q, &arg.w);
1414         return arg.w.stop ? -ELOOP : 0;
1415 }
1416
1417 static int
1418 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1419 {
1420         struct Qdisc *leaf;
1421         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1422         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1423
1424         leaf = cops->leaf(q, cl);
1425         if (leaf) {
1426                 if (leaf == arg->p || arg->depth > 7)
1427                         return -ELOOP;
1428                 return check_loop(leaf, arg->p, arg->depth + 1);
1429         }
1430         return 0;
1431 }
1432
/*
 * Netlink attribute policy for TCA_* attributes, indexed by attribute type.
 * Passed to nlmsg_parse_deprecated() by tc_get_qdisc() and tc_modify_qdisc()
 * in this file; attribute types without an entry here are left
 * zero-initialized.
 */
1433 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1434         [TCA_KIND]              = { .type = NLA_STRING },
             /* Binary payload; .len is sizeof(struct tc_estimator). */
1435         [TCA_RATE]              = { .type = NLA_BINARY,
1436                                     .len = sizeof(struct tc_estimator) },
1437         [TCA_STAB]              = { .type = NLA_NESTED },
1438         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1439         [TCA_CHAIN]             = { .type = NLA_U32 },
1440         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1441         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1442 };
1443
1444 /*
1445  * Delete/get qdisc.
1446  */
1447
1448 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1449                         struct netlink_ext_ack *extack)
1450 {
1451         struct net *net = sock_net(skb->sk);
1452         struct tcmsg *tcm = nlmsg_data(n);
1453         struct nlattr *tca[TCA_MAX + 1];
1454         struct net_device *dev;
1455         u32 clid;
1456         struct Qdisc *q = NULL;
1457         struct Qdisc *p = NULL;
1458         int err;
1459
1460         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1461                                      rtm_tca_policy, extack);
1462         if (err < 0)
1463                 return err;
1464
1465         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1466         if (!dev)
1467                 return -ENODEV;
1468
1469         clid = tcm->tcm_parent;
1470         if (clid) {
1471                 if (clid != TC_H_ROOT) {
1472                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1473                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1474                                 if (!p) {
1475                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1476                                         return -ENOENT;
1477                                 }
1478                                 q = qdisc_leaf(p, clid);
1479                         } else if (dev_ingress_queue(dev)) {
1480                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1481                         }
1482                 } else {
1483                         q = rtnl_dereference(dev->qdisc);
1484                 }
1485                 if (!q) {
1486                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1487                         return -ENOENT;
1488                 }
1489
1490                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1491                         NL_SET_ERR_MSG(extack, "Invalid handle");
1492                         return -EINVAL;
1493                 }
1494         } else {
1495                 q = qdisc_lookup(dev, tcm->tcm_handle);
1496                 if (!q) {
1497                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1498                         return -ENOENT;
1499                 }
1500         }
1501
1502         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1503                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1504                 return -EINVAL;
1505         }
1506
1507         if (n->nlmsg_type == RTM_DELQDISC) {
1508                 if (!clid) {
1509                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1510                         return -EINVAL;
1511                 }
1512                 if (q->handle == 0) {
1513                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1514                         return -ENOENT;
1515                 }
1516                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1517                 if (err != 0)
1518                         return err;
1519         } else {
1520                 qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1521         }
1522         return 0;
1523 }
1524
1525 /*
      * Create/change qdisc.
      *
      * Netlink request handler operating on the device named by tcm_ifindex.
      * tcm_parent ("clid") selects the attachment point and tcm_handle names the
      * qdisc.  Depending on what is currently attached and on the NLM_F_* flags
      * of the request, one of three things happens:
      *
      *   change          the qdisc found is updated via qdisc_change() and a
      *                   notification is sent;
      *   create_n_graft  a new qdisc is built with qdisc_create() and grafted
      *                   at clid (requires NLM_F_CREATE);
      *   graft           an already existing qdisc, found by handle, is grafted
      *                   at clid.
      *
      * Returns 0 on success or a negative errno: -ENODEV, -ENOENT, -EEXIST,
      * -EINVAL, -ELOOP, or an error from attribute parsing, qdisc_create(),
      * qdisc_change() or qdisc_graft().
      */
1528
1529 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1530                            struct netlink_ext_ack *extack)
1531 {
1532         struct net *net = sock_net(skb->sk);
1533         struct tcmsg *tcm;
1534         struct nlattr *tca[TCA_MAX + 1];
1535         struct net_device *dev;
1536         u32 clid;
1537         struct Qdisc *q, *p;
1538         int err;
1539
             /* Jumped back to when qdisc_create() reports -EAGAIN; everything
              * derived from the message and the device is recomputed.
              */
1540 replay:
1541         /* Reinit, just in case something touches this. */
1542         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1543                                      rtm_tca_policy, extack);
1544         if (err < 0)
1545                 return err;
1546
1547         tcm = nlmsg_data(n);
1548         clid = tcm->tcm_parent;
1549         q = p = NULL;
1550
1551         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1552         if (!dev)
1553                 return -ENODEV;
1554
1555
1556         if (clid) {
                     /* Find q, the qdisc currently attached at clid:
                      * the leaf of a class in parent p, the ingress queue's
                      * sleeping qdisc, or the device root.
                      */
1557                 if (clid != TC_H_ROOT) {
1558                         if (clid != TC_H_INGRESS) {
1559                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1560                                 if (!p) {
1561                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1562                                         return -ENOENT;
1563                                 }
1564                                 q = qdisc_leaf(p, clid);
                             /* NOTE(review): judging by its name, this creates
                              * the ingress queue on demand - confirm.
                              */
1565                         } else if (dev_ingress_queue_create(dev)) {
1566                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1567                         }
1568                 } else {
1569                         q = rtnl_dereference(dev->qdisc);
1570                 }
1571
1572                 /* It may be default qdisc, ignore it */
1573                 if (q && q->handle == 0)
1574                         q = NULL;
1575
                     /* Unless the request names exactly the qdisc attached at
                      * clid, decide between create, graft and change.
                      */
1576                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1577                         if (tcm->tcm_handle) {
                                     /* A handle was given and it is not the one
                                      * attached here.  Displacing an attached
                                      * qdisc requires NLM_F_REPLACE.
                                      */
1578                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1579                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1580                                         return -EEXIST;
1581                                 }
1582                                 if (TC_H_MIN(tcm->tcm_handle)) {
1583                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1584                                         return -EINVAL;
1585                                 }
                                     /* No qdisc with that handle on the device:
                                      * create it.  Otherwise the existing one is
                                      * grafted at clid.
                                      */
1586                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1587                                 if (!q)
1588                                         goto create_n_graft;
1589                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1590                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1591                                         return -EEXIST;
1592                                 }
1593                                 if (tca[TCA_KIND] &&
1594                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1595                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1596                                         return -EINVAL;
1597                                 }
                                     /* Refuse to graft a qdisc below itself. */
1598                                 if (q == p ||
1599                                     (p && check_loop(q, p, 0))) {
1600                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1601                                         return -ELOOP;
1602                                 }
                                     /* Reference for the new attachment; dropped
                                      * at "graft" if qdisc_graft() fails.
                                      */
1603                                 qdisc_refcount_inc(q);
1604                                 goto graft;
1605                         } else {
                                     /* No handle given. */
1606                                 if (!q)
1607                                         goto create_n_graft;
1608
1609                                 /* This magic test requires explanation.
1610                                  *
1611                                  *   We know, that some child q is already
1612                                  *   attached to this parent and have choice:
1613                                  *   either to change it or to create/graft new one.
1614                                  *
1615                                  *   1. We are allowed to create/graft only
1616                                  *   if CREATE and REPLACE flags are set.
1617                                  *
1618                                  *   2. If EXCL is set, requestor wanted to say,
1619                                  *   that qdisc tcm_handle is not expected
1620                                  *   to exist, so that we choose create/graft too.
1621                                  *
1622                                  *   3. The last case is when no flags are set.
1623                                  *   Alas, it is sort of hole in API, we
1624                                  *   cannot decide what to do unambiguously.
1625                                  *   For now we select create/graft, if
1626                                  *   user gave KIND, which does not match existing.
1627                                  */
1628                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1629                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1630                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1631                                      (tca[TCA_KIND] &&
1632                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1633                                         goto create_n_graft;
                                     /* Otherwise fall through and change q. */
1634                         }
1635                 }
1636         } else {
                     /* No attachment point: the qdisc is addressed by handle
                      * only, so the request can only be a change.
                      */
1637                 if (!tcm->tcm_handle) {
1638                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1639                         return -EINVAL;
1640                 }
1641                 q = qdisc_lookup(dev, tcm->tcm_handle);
1642         }
1643
1644         /* Change qdisc parameters */
1645         if (!q) {
1646                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1647                 return -ENOENT;
1648         }
1649         if (n->nlmsg_flags & NLM_F_EXCL) {
1650                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1651                 return -EEXIST;
1652         }
1653         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1654                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1655                 return -EINVAL;
1656         }
1657         err = qdisc_change(q, tca, extack);
1658         if (err == 0)
1659                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1660         return err;
1661
1662 create_n_graft:
1663         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1664                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1665                 return -ENOENT;
1666         }
1667         if (clid == TC_H_INGRESS) {
                     /* Ingress: tcm_parent (== TC_H_INGRESS here) is passed as
                      * both the parent and the handle.
                      */
1668                 if (dev_ingress_queue(dev)) {
1669                         q = qdisc_create(dev, dev_ingress_queue(dev),
1670                                          tcm->tcm_parent, tcm->tcm_parent,
1671                                          tca, &err, extack);
1672                 } else {
1673                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1674                         err = -ENOENT;
1675                 }
1676         } else {
1677                 struct netdev_queue *dev_queue;
1678
                     /* Queue for the new qdisc: the parent's ->select_queue()
                      * choice if it has one, else the parent's own queue, else
                      * TX queue 0 when there is no parent.
                      */
1679                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1680                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1681                 else if (p)
1682                         dev_queue = p->dev_queue;
1683                 else
1684                         dev_queue = netdev_get_tx_queue(dev, 0);
1685
1686                 q = qdisc_create(dev, dev_queue,
1687                                  tcm->tcm_parent, tcm->tcm_handle,
1688                                  tca, &err, extack);
1689         }
1690         if (q == NULL) {
1691                 if (err == -EAGAIN)
1692                         goto replay;
1693                 return err;
1694         }
1695
1696 graft:
             /* "old" is passed as NULL: the qdisc being displaced is determined
              * inside qdisc_graft().  On failure, drop the reference on q.
              */
1697         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1698         if (err) {
1699                 if (q)
1700                         qdisc_put(q);
1701                 return err;
1702         }
1703
1704         return 0;
1705 }
1706
1707 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1708                               struct netlink_callback *cb,
1709                               int *q_idx_p, int s_q_idx, bool recur,
1710                               bool dump_invisible)
1711 {
1712         int ret = 0, q_idx = *q_idx_p;
1713         struct Qdisc *q;
1714         int b;
1715
1716         if (!root)
1717                 return 0;
1718
1719         q = root;
1720         if (q_idx < s_q_idx) {
1721                 q_idx++;
1722         } else {
1723                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1724                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1725                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1726                                   RTM_NEWQDISC, NULL) <= 0)
1727                         goto done;
1728                 q_idx++;
1729         }
1730
1731         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1732          * itself has already been dumped.
1733          *
1734          * If we've already dumped the top-level (ingress) qdisc above and the global
1735          * qdisc hashtable, we don't want to hit it again
1736          */
1737         if (!qdisc_dev(root) || !recur)
1738                 goto out;
1739
1740         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1741                 if (q_idx < s_q_idx) {
1742                         q_idx++;
1743                         continue;
1744                 }
1745                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1746                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1747                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1748                                   RTM_NEWQDISC, NULL) <= 0)
1749                         goto done;
1750                 q_idx++;
1751         }
1752
1753 out:
1754         *q_idx_p = q_idx;
1755         return ret;
1756 done:
1757         ret = -1;
1758         goto out;
1759 }
1760
1761 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1762 {
1763         struct net *net = sock_net(skb->sk);
1764         int idx, q_idx;
1765         int s_idx, s_q_idx;
1766         struct net_device *dev;
1767         const struct nlmsghdr *nlh = cb->nlh;
1768         struct nlattr *tca[TCA_MAX + 1];
1769         int err;
1770
1771         s_idx = cb->args[0];
1772         s_q_idx = q_idx = cb->args[1];
1773
1774         idx = 0;
1775         ASSERT_RTNL();
1776
1777         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1778                                      rtm_tca_policy, cb->extack);
1779         if (err < 0)
1780                 return err;
1781
1782         for_each_netdev(net, dev) {
1783                 struct netdev_queue *dev_queue;
1784
1785                 if (idx < s_idx)
1786                         goto cont;
1787                 if (idx > s_idx)
1788                         s_q_idx = 0;
1789                 q_idx = 0;
1790
1791                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1792                                        skb, cb, &q_idx, s_q_idx,
1793                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1794                         goto done;
1795
1796                 dev_queue = dev_ingress_queue(dev);
1797                 if (dev_queue &&
1798                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1799                                        &q_idx, s_q_idx, false,
1800                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1801                         goto done;
1802
1803 cont:
1804                 idx++;
1805         }
1806
1807 done:
1808         cb->args[0] = idx;
1809         cb->args[1] = q_idx;
1810
1811         return skb->len;
1812 }
1813
1814
1815
1816 /************************************************
1817  *      Traffic classes manipulation.           *
1818  ************************************************/
1819
1820 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1821                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1822                           int event, struct netlink_ext_ack *extack)
1823 {
1824         struct tcmsg *tcm;
1825         struct nlmsghdr  *nlh;
1826         unsigned char *b = skb_tail_pointer(skb);
1827         struct gnet_dump d;
1828         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1829
1830         cond_resched();
1831         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1832         if (!nlh)
1833                 goto out_nlmsg_trim;
1834         tcm = nlmsg_data(nlh);
1835         tcm->tcm_family = AF_UNSPEC;
1836         tcm->tcm__pad1 = 0;
1837         tcm->tcm__pad2 = 0;
1838         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1839         tcm->tcm_parent = q->handle;
1840         tcm->tcm_handle = q->handle;
1841         tcm->tcm_info = 0;
1842         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1843                 goto nla_put_failure;
1844         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1845                 goto nla_put_failure;
1846
1847         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1848                                          NULL, &d, TCA_PAD) < 0)
1849                 goto nla_put_failure;
1850
1851         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1852                 goto nla_put_failure;
1853
1854         if (gnet_stats_finish_copy(&d) < 0)
1855                 goto nla_put_failure;
1856
1857         if (extack && extack->_msg &&
1858             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1859                 goto out_nlmsg_trim;
1860
1861         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1862
1863         return skb->len;
1864
1865 out_nlmsg_trim:
1866 nla_put_failure:
1867         nlmsg_trim(skb, b);
1868         return -1;
1869 }
1870
1871 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1872                          struct nlmsghdr *n, struct Qdisc *q,
1873                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1874 {
1875         struct sk_buff *skb;
1876         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1877
1878         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1879         if (!skb)
1880                 return -ENOBUFS;
1881
1882         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1883                 kfree_skb(skb);
1884                 return -EINVAL;
1885         }
1886
1887         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1888                               n->nlmsg_flags & NLM_F_ECHO);
1889 }
1890
1891 static int tclass_del_notify(struct net *net,
1892                              const struct Qdisc_class_ops *cops,
1893                              struct sk_buff *oskb, struct nlmsghdr *n,
1894                              struct Qdisc *q, unsigned long cl,
1895                              struct netlink_ext_ack *extack)
1896 {
1897         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1898         struct sk_buff *skb;
1899         int err = 0;
1900
1901         if (!cops->delete)
1902                 return -EOPNOTSUPP;
1903
1904         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1905         if (!skb)
1906                 return -ENOBUFS;
1907
1908         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1909                            RTM_DELTCLASS, extack) < 0) {
1910                 kfree_skb(skb);
1911                 return -EINVAL;
1912         }
1913
1914         err = cops->delete(q, cl, extack);
1915         if (err) {
1916                 kfree_skb(skb);
1917                 return err;
1918         }
1919
1920         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1921                              n->nlmsg_flags & NLM_F_ECHO);
1922         return err;
1923 }
1924
1925 #ifdef CONFIG_NET_CLS
1926
/*
 * Argument bundle passed through a classifier's ->walk() to tcf_node_bind().
 * tcf_node_bind() casts the struct tcf_walker pointer it receives back to
 * this type, so @w has to stay the first member.
 */
struct tcf_bind_args {
        struct tcf_walker w;    /* walker handed to tp->ops->walk() */
        unsigned long base;     /* class cookie the qdisc walk is visiting */
        unsigned long cl;       /* new class cookie passed to ->bind_class() */
        u32 classid;            /* class id passed to ->bind_class() */
};
1933
1934 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1935 {
1936         struct tcf_bind_args *a = (void *)arg;
1937
1938         if (n && tp->ops->bind_class) {
1939                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1940
1941                 sch_tree_lock(q);
1942                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1943                 sch_tree_unlock(q);
1944         }
1945         return 0;
1946 }
1947
/*
 * Argument bundle for the qdisc class walk started by tc_bind_tclass().
 * tc_bind_class_walker() casts the struct qdisc_walker pointer it receives
 * back to this type, so @w has to stay the first member.
 */
struct tc_bind_class_args {
        struct qdisc_walker w;  /* walker handed to cl_ops->walk() */
        unsigned long new_cl;   /* class cookie filters get (re)bound to */
        u32 portid;             /* copied from the caller; not read by the walker */
        u32 clid;               /* class id forwarded to the filter walk */
};
1954
1955 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1956                                 struct qdisc_walker *w)
1957 {
1958         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1959         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1960         struct tcf_block *block;
1961         struct tcf_chain *chain;
1962
1963         block = cops->tcf_block(q, cl, NULL);
1964         if (!block)
1965                 return 0;
1966         for (chain = tcf_get_next_chain(block, NULL);
1967              chain;
1968              chain = tcf_get_next_chain(block, chain)) {
1969                 struct tcf_proto *tp;
1970
1971                 for (tp = tcf_get_next_proto(chain, NULL);
1972                      tp; tp = tcf_get_next_proto(chain, tp)) {
1973                         struct tcf_bind_args arg = {};
1974
1975                         arg.w.fn = tcf_node_bind;
1976                         arg.classid = a->clid;
1977                         arg.base = cl;
1978                         arg.cl = a->new_cl;
1979                         tp->ops->walk(tp, &arg.w, true);
1980                 }
1981         }
1982
1983         return 0;
1984 }
1985
1986 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1987                            unsigned long new_cl)
1988 {
1989         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1990         struct tc_bind_class_args args = {};
1991
1992         if (!cops->tcf_block)
1993                 return;
1994         args.portid = portid;
1995         args.clid = clid;
1996         args.new_cl = new_cl;
1997         args.w.fn = tc_bind_class_walker;
1998         q->ops->cl_ops->walk(q, &args.w);
1999 }
2000
2001 #else
2002
/* Without CONFIG_NET_CLS this is a no-op that keeps the callers unconditional. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
2007
2008 #endif
2009
2010 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2011                          struct netlink_ext_ack *extack)
2012 {
2013         struct net *net = sock_net(skb->sk);
2014         struct tcmsg *tcm = nlmsg_data(n);
2015         struct nlattr *tca[TCA_MAX + 1];
2016         struct net_device *dev;
2017         struct Qdisc *q = NULL;
2018         const struct Qdisc_class_ops *cops;
2019         unsigned long cl = 0;
2020         unsigned long new_cl;
2021         u32 portid;
2022         u32 clid;
2023         u32 qid;
2024         int err;
2025
2026         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2027                                      rtm_tca_policy, extack);
2028         if (err < 0)
2029                 return err;
2030
2031         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2032         if (!dev)
2033                 return -ENODEV;
2034
2035         /*
2036            parent == TC_H_UNSPEC - unspecified parent.
2037            parent == TC_H_ROOT   - class is root, which has no parent.
2038            parent == X:0         - parent is root class.
2039            parent == X:Y         - parent is a node in hierarchy.
2040            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2041
2042            handle == 0:0         - generate handle from kernel pool.
2043            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2044            handle == X:Y         - clear.
2045            handle == X:0         - root class.
2046          */
2047
2048         /* Step 1. Determine qdisc handle X:0 */
2049
2050         portid = tcm->tcm_parent;
2051         clid = tcm->tcm_handle;
2052         qid = TC_H_MAJ(clid);
2053
2054         if (portid != TC_H_ROOT) {
2055                 u32 qid1 = TC_H_MAJ(portid);
2056
2057                 if (qid && qid1) {
2058                         /* If both majors are known, they must be identical. */
2059                         if (qid != qid1)
2060                                 return -EINVAL;
2061                 } else if (qid1) {
2062                         qid = qid1;
2063                 } else if (qid == 0)
2064                         qid = rtnl_dereference(dev->qdisc)->handle;
2065
2066                 /* Now qid is genuine qdisc handle consistent
2067                  * both with parent and child.
2068                  *
2069                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2070                  */
2071                 if (portid)
2072                         portid = TC_H_MAKE(qid, portid);
2073         } else {
2074                 if (qid == 0)
2075                         qid = rtnl_dereference(dev->qdisc)->handle;
2076         }
2077
2078         /* OK. Locate qdisc */
2079         q = qdisc_lookup(dev, qid);
2080         if (!q)
2081                 return -ENOENT;
2082
2083         /* An check that it supports classes */
2084         cops = q->ops->cl_ops;
2085         if (cops == NULL)
2086                 return -EINVAL;
2087
2088         /* Now try to get class */
2089         if (clid == 0) {
2090                 if (portid == TC_H_ROOT)
2091                         clid = qid;
2092         } else
2093                 clid = TC_H_MAKE(qid, clid);
2094
2095         if (clid)
2096                 cl = cops->find(q, clid);
2097
2098         if (cl == 0) {
2099                 err = -ENOENT;
2100                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2101                     !(n->nlmsg_flags & NLM_F_CREATE))
2102                         goto out;
2103         } else {
2104                 switch (n->nlmsg_type) {
2105                 case RTM_NEWTCLASS:
2106                         err = -EEXIST;
2107                         if (n->nlmsg_flags & NLM_F_EXCL)
2108                                 goto out;
2109                         break;
2110                 case RTM_DELTCLASS:
2111                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2112                         /* Unbind the class with flilters with 0 */
2113                         tc_bind_tclass(q, portid, clid, 0);
2114                         goto out;
2115                 case RTM_GETTCLASS:
2116                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
2117                         goto out;
2118                 default:
2119                         err = -EINVAL;
2120                         goto out;
2121                 }
2122         }
2123
2124         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2125                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2126                 return -EOPNOTSUPP;
2127         }
2128
2129         new_cl = cl;
2130         err = -EOPNOTSUPP;
2131         if (cops->change)
2132                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2133         if (err == 0) {
2134                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2135                 /* We just create a new class, need to do reverse binding. */
2136                 if (cl != new_cl)
2137                         tc_bind_tclass(q, portid, clid, new_cl);
2138         }
2139 out:
2140         return err;
2141 }
2142
/*
 * Argument bundle for the class walk done by tc_dump_tclass_qdisc().
 * qdisc_class_dump() casts the struct qdisc_walker pointer it receives back
 * to this type, so @w has to stay the first member.
 */
struct qdisc_dump_args {
        struct qdisc_walker     w;      /* walker handed to cl_ops->walk() */
        struct sk_buff          *skb;   /* dump buffer being filled */
        struct netlink_callback *cb;    /* dump callback state */
};
2148
2149 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2150                             struct qdisc_walker *arg)
2151 {
2152         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2153
2154         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2155                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2156                               RTM_NEWTCLASS, NULL);
2157 }
2158
2159 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2160                                 struct tcmsg *tcm, struct netlink_callback *cb,
2161                                 int *t_p, int s_t)
2162 {
2163         struct qdisc_dump_args arg;
2164
2165         if (tc_qdisc_dump_ignore(q, false) ||
2166             *t_p < s_t || !q->ops->cl_ops ||
2167             (tcm->tcm_parent &&
2168              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2169                 (*t_p)++;
2170                 return 0;
2171         }
2172         if (*t_p > s_t)
2173                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2174         arg.w.fn = qdisc_class_dump;
2175         arg.skb = skb;
2176         arg.cb = cb;
2177         arg.w.stop  = 0;
2178         arg.w.skip = cb->args[1];
2179         arg.w.count = 0;
2180         q->ops->cl_ops->walk(q, &arg.w);
2181         cb->args[1] = arg.w.count;
2182         if (arg.w.stop)
2183                 return -1;
2184         (*t_p)++;
2185         return 0;
2186 }
2187
2188 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2189                                struct tcmsg *tcm, struct netlink_callback *cb,
2190                                int *t_p, int s_t, bool recur)
2191 {
2192         struct Qdisc *q;
2193         int b;
2194
2195         if (!root)
2196                 return 0;
2197
2198         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2199                 return -1;
2200
2201         if (!qdisc_dev(root) || !recur)
2202                 return 0;
2203
2204         if (tcm->tcm_parent) {
2205                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2206                 if (q && q != root &&
2207                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2208                         return -1;
2209                 return 0;
2210         }
2211         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2212                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2213                         return -1;
2214         }
2215
2216         return 0;
2217 }
2218
2219 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2220 {
2221         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2222         struct net *net = sock_net(skb->sk);
2223         struct netdev_queue *dev_queue;
2224         struct net_device *dev;
2225         int t, s_t;
2226
2227         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2228                 return 0;
2229         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2230         if (!dev)
2231                 return 0;
2232
2233         s_t = cb->args[0];
2234         t = 0;
2235
2236         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2237                                 skb, tcm, cb, &t, s_t, true) < 0)
2238                 goto done;
2239
2240         dev_queue = dev_ingress_queue(dev);
2241         if (dev_queue &&
2242             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2243                                 &t, s_t, false) < 0)
2244                 goto done;
2245
2246 done:
2247         cb->args[0] = t;
2248
2249         dev_put(dev);
2250         return skb->len;
2251 }
2252
2253 #ifdef CONFIG_PROC_FS
2254 static int psched_show(struct seq_file *seq, void *v)
2255 {
2256         seq_printf(seq, "%08x %08x %08x %08x\n",
2257                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2258                    1000000,
2259                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2260
2261         return 0;
2262 }
2263
2264 static int __net_init psched_net_init(struct net *net)
2265 {
2266         struct proc_dir_entry *e;
2267
2268         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2269         if (e == NULL)
2270                 return -ENOMEM;
2271
2272         return 0;
2273 }
2274
/* Per-netns exit: remove the "psched" proc entry created by psched_net_init(). */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
2279 #else
/* Without CONFIG_PROC_FS there is no "psched" entry to create; always succeeds. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}

/* Without CONFIG_PROC_FS there is nothing to tear down. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2288 #endif
2289
/* Per-netns hooks registered from pktsched_init(). */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
2294
/*
 * Static key, initially false.
 * NOTE(review): presumably consumed by the helpers in <net/tc_wrapper.h>
 * and set up by tc_wrapper_init() - confirm against that header.
 */
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2296
2297 static int __init pktsched_init(void)
2298 {
2299         int err;
2300
2301         err = register_pernet_subsys(&psched_net_ops);
2302         if (err) {
2303                 pr_err("pktsched_init: "
2304                        "cannot initialize per netns operations\n");
2305                 return err;
2306         }
2307
2308         register_qdisc(&pfifo_fast_ops);
2309         register_qdisc(&pfifo_qdisc_ops);
2310         register_qdisc(&bfifo_qdisc_ops);
2311         register_qdisc(&pfifo_head_drop_qdisc_ops);
2312         register_qdisc(&mq_qdisc_ops);
2313         register_qdisc(&noqueue_qdisc_ops);
2314
2315         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2316         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2317         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2318                       0);
2319         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2320         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2321         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2322                       0);
2323
2324         tc_wrapper_init();
2325
2326         return 0;
2327 }
2328
2329 subsys_initcall(pktsched_init);