Merge tag '6.4-rc-ksmbd-server-fixes-part2' of git://git.samba.org/ksmbd
[linux-block.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34 #include <net/tc_wrapper.h>
35
36 #include <trace/events/qdisc.h>
37
38 /*
39
40    Short review.
41    -------------
42
43    This file consists of two interrelated parts:
44
45    1. queueing disciplines manager frontend.
46    2. traffic classes manager frontend.
47
48    Generally, queueing discipline ("qdisc") is a black box,
49    which is able to enqueue packets and to dequeue them (when
50    device is ready to send something) in order and at times
51    determined by algorithm hidden in it.
52
53    qdiscs are divided into two categories:
54    - "queues", which have no internal structure visible from outside.
55    - "schedulers", which split all the packets to "traffic classes",
56      using "packet classifiers" (look at cls_api.c)
57
58    In turn, classes may have child qdiscs (as rule, queues)
59    attached to them etc. etc. etc.
60
61    The goal of the routines in this file is to translate
62    information supplied by user in the form of handles
63    to more intelligible for kernel form, to make some sanity
64    checks and part of work, which is common to all qdiscs
65    and to provide rtnetlink notifications.
66
67    All real intelligent work is done inside qdisc modules.
68
69
70
71    Every discipline has two major routines: enqueue and dequeue.
72
73    ---dequeue
74
75    dequeue usually returns a skb to send. It is allowed to return NULL,
76    but it does not mean that queue is empty, it just means that
77    discipline does not want to send anything this time.
78    Queue is really empty if q->q.qlen == 0.
79    For complicated disciplines with multiple queues q->q is not
80    real packet queue, but however q->q.qlen must be valid.
81
82    ---enqueue
83
84    enqueue returns 0, if packet was enqueued successfully.
85    If packet (this one or another one) was dropped, it returns
86    not zero error code.
87    NET_XMIT_DROP        - this packet dropped
88      Expected action: do not backoff, but wait until queue will clear.
89    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
90      Expected action: backoff or ignore
91
92    Auxiliary routines:
93
94    ---peek
95
96    like dequeue but without removing a packet from the queue
97
98    ---reset
99
100    returns qdisc to initial state: purge all buffers, clear all
101    timers, counters (except for statistics) etc.
102
103    ---init
104
105    initializes newly created qdisc.
106
107    ---destroy
108
109    destroys resources allocated by init and during lifetime of qdisc.
110
111    ---change
112
113    changes qdisc parameters.
114  */
115
/* Protects the list of registered TC modules (qdisc_base). It is a pure
 * SMP lock. In this file it is also held around reads and updates of
 * default_qdisc_ops.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
118
119
120 /************************************************
121  *      Queueing disciplines manipulation.      *
122  ************************************************/
123
124
/* Head of the list of all installed queueing disciplines. Entries are
 * chained through Qdisc_ops::next; new entries are appended at the tail.
 */

static struct Qdisc_ops *qdisc_base;
128
129 /* Register/unregister queueing discipline */
130
131 int register_qdisc(struct Qdisc_ops *qops)
132 {
133         struct Qdisc_ops *q, **qp;
134         int rc = -EEXIST;
135
136         write_lock(&qdisc_mod_lock);
137         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
138                 if (!strcmp(qops->id, q->id))
139                         goto out;
140
141         if (qops->enqueue == NULL)
142                 qops->enqueue = noop_qdisc_ops.enqueue;
143         if (qops->peek == NULL) {
144                 if (qops->dequeue == NULL)
145                         qops->peek = noop_qdisc_ops.peek;
146                 else
147                         goto out_einval;
148         }
149         if (qops->dequeue == NULL)
150                 qops->dequeue = noop_qdisc_ops.dequeue;
151
152         if (qops->cl_ops) {
153                 const struct Qdisc_class_ops *cops = qops->cl_ops;
154
155                 if (!(cops->find && cops->walk && cops->leaf))
156                         goto out_einval;
157
158                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
159                         goto out_einval;
160         }
161
162         qops->next = NULL;
163         *qp = qops;
164         rc = 0;
165 out:
166         write_unlock(&qdisc_mod_lock);
167         return rc;
168
169 out_einval:
170         rc = -EINVAL;
171         goto out;
172 }
173 EXPORT_SYMBOL(register_qdisc);
174
175 void unregister_qdisc(struct Qdisc_ops *qops)
176 {
177         struct Qdisc_ops *q, **qp;
178         int err = -ENOENT;
179
180         write_lock(&qdisc_mod_lock);
181         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
182                 if (q == qops)
183                         break;
184         if (q) {
185                 *qp = q->next;
186                 q->next = NULL;
187                 err = 0;
188         }
189         write_unlock(&qdisc_mod_lock);
190
191         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
192 }
193 EXPORT_SYMBOL(unregister_qdisc);
194
195 /* Get default qdisc if not otherwise specified */
196 void qdisc_get_default(char *name, size_t len)
197 {
198         read_lock(&qdisc_mod_lock);
199         strscpy(name, default_qdisc_ops->id, len);
200         read_unlock(&qdisc_mod_lock);
201 }
202
203 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
204 {
205         struct Qdisc_ops *q = NULL;
206
207         for (q = qdisc_base; q; q = q->next) {
208                 if (!strcmp(name, q->id)) {
209                         if (!try_module_get(q->owner))
210                                 q = NULL;
211                         break;
212                 }
213         }
214
215         return q;
216 }
217
218 /* Set new default qdisc to use */
219 int qdisc_set_default(const char *name)
220 {
221         const struct Qdisc_ops *ops;
222
223         if (!capable(CAP_NET_ADMIN))
224                 return -EPERM;
225
226         write_lock(&qdisc_mod_lock);
227         ops = qdisc_lookup_default(name);
228         if (!ops) {
229                 /* Not found, drop lock and try to load module */
230                 write_unlock(&qdisc_mod_lock);
231                 request_module("sch_%s", name);
232                 write_lock(&qdisc_mod_lock);
233
234                 ops = qdisc_lookup_default(name);
235         }
236
237         if (ops) {
238                 /* Set new default */
239                 module_put(default_qdisc_ops->owner);
240                 default_qdisc_ops = ops;
241         }
242         write_unlock(&qdisc_mod_lock);
243
244         return ops ? 0 : -ENOENT;
245 }
246
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 *
 * Registered as a late initcall; passes CONFIG_DEFAULT_NET_SCH to
 * qdisc_set_default() and returns its result.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
255
256 /* We know handle. Find qdisc among all qdisc's attached to device
257  * (root qdisc, all its children, children of children etc.)
258  * Note: caller either uses rtnl or rcu_read_lock()
259  */
260
261 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
262 {
263         struct Qdisc *q;
264
265         if (!qdisc_dev(root))
266                 return (root->handle == handle ? root : NULL);
267
268         if (!(root->flags & TCQ_F_BUILTIN) &&
269             root->handle == handle)
270                 return root;
271
272         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
273                                    lockdep_rtnl_is_held()) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
319 {
320         struct netdev_queue *nq;
321         struct Qdisc *q;
322
323         if (!handle)
324                 return NULL;
325         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
326         if (q)
327                 goto out;
328
329         nq = dev_ingress_queue_rcu(dev);
330         if (nq)
331                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
332 out:
333         return q;
334 }
335
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338         unsigned long cl;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         return cops->leaf(p, cl);
348 }
349
350 /* Find queueing discipline by name */
351
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354         struct Qdisc_ops *q = NULL;
355
356         if (kind) {
357                 read_lock(&qdisc_mod_lock);
358                 for (q = qdisc_base; q; q = q->next) {
359                         if (nla_strcmp(kind, q->id) == 0) {
360                                 if (!try_module_get(q->owner))
361                                         q = NULL;
362                                 break;
363                         }
364                 }
365                 read_unlock(&qdisc_mod_lock);
366         }
367         return q;
368 }
369
370 /* The linklayer setting were not transferred from iproute2, in older
371  * versions, and the rate tables lookup systems have been dropped in
372  * the kernel. To keep backward compatible with older iproute2 tc
373  * utils, we detect the linklayer setting by detecting if the rate
374  * table were modified.
375  *
376  * For linklayer ATM table entries, the rate table will be aligned to
377  * 48 bytes, thus some table entries will contain the same value.  The
378  * mpu (min packet unit) is also encoded into the old rate table, thus
379  * starting from the mpu, we find low and high table entries for
380  * mapping this cell.  If these entries contain the same value, then
381  * the rate tables have been modified for linklayer ATM.
382  *
383  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384  * and then roundup to the next cell, calc the table entry one below,
385  * and compare.
386  */
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
388 {
389         int low       = roundup(r->mpu, 48);
390         int high      = roundup(low+1, 48);
391         int cell_low  = low >> r->cell_log;
392         int cell_high = (high >> r->cell_log) - 1;
393
394         /* rtab is too inaccurate at rates > 100Mbit/s */
395         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396                 pr_debug("TC linklayer: Giving up ATM detection\n");
397                 return TC_LINKLAYER_ETHERNET;
398         }
399
400         if ((cell_high > cell_low) && (cell_high < 256)
401             && (rtab[cell_low] == rtab[cell_high])) {
402                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403                          cell_low, cell_high, rtab[cell_high]);
404                 return TC_LINKLAYER_ATM;
405         }
406         return TC_LINKLAYER_ETHERNET;
407 }
408
409 static struct qdisc_rate_table *qdisc_rtab_list;
410
411 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
412                                         struct nlattr *tab,
413                                         struct netlink_ext_ack *extack)
414 {
415         struct qdisc_rate_table *rtab;
416
417         if (tab == NULL || r->rate == 0 ||
418             r->cell_log == 0 || r->cell_log >= 32 ||
419             nla_len(tab) != TC_RTAB_SIZE) {
420                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421                 return NULL;
422         }
423
424         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
427                         rtab->refcnt++;
428                         return rtab;
429                 }
430         }
431
432         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433         if (rtab) {
434                 rtab->rate = *r;
435                 rtab->refcnt = 1;
436                 memcpy(rtab->data, nla_data(tab), 1024);
437                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438                         r->linklayer = __detect_linklayer(r, rtab->data);
439                 rtab->next = qdisc_rtab_list;
440                 qdisc_rtab_list = rtab;
441         } else {
442                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443         }
444         return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
/* All size tables currently in use; shared entries are reference counted. */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for the attributes nested inside a size table option:
 * TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA is a binary
 * blob (interpreted as an array of u16 by qdisc_get_stab()).
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
473
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475                                                struct netlink_ext_ack *extack)
476 {
477         struct nlattr *tb[TCA_STAB_MAX + 1];
478         struct qdisc_size_table *stab;
479         struct tc_sizespec *s;
480         unsigned int tsize = 0;
481         u16 *tab = NULL;
482         int err;
483
484         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
485                                           extack);
486         if (err < 0)
487                 return ERR_PTR(err);
488         if (!tb[TCA_STAB_BASE]) {
489                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
490                 return ERR_PTR(-EINVAL);
491         }
492
493         s = nla_data(tb[TCA_STAB_BASE]);
494
495         if (s->tsize > 0) {
496                 if (!tb[TCA_STAB_DATA]) {
497                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
498                         return ERR_PTR(-EINVAL);
499                 }
500                 tab = nla_data(tb[TCA_STAB_DATA]);
501                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
502         }
503
504         if (tsize != s->tsize || (!tab && tsize > 0)) {
505                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
506                 return ERR_PTR(-EINVAL);
507         }
508
509         list_for_each_entry(stab, &qdisc_stab_list, list) {
510                 if (memcmp(&stab->szopts, s, sizeof(*s)))
511                         continue;
512                 if (tsize > 0 &&
513                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
514                         continue;
515                 stab->refcnt++;
516                 return stab;
517         }
518
519         if (s->size_log > STAB_SIZE_LOG_MAX ||
520             s->cell_log > STAB_SIZE_LOG_MAX) {
521                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522                 return ERR_PTR(-EINVAL);
523         }
524
525         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
526         if (!stab)
527                 return ERR_PTR(-ENOMEM);
528
529         stab->refcnt = 1;
530         stab->szopts = *s;
531         if (tsize > 0)
532                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
533
534         list_add_tail(&stab->list, &qdisc_stab_list);
535
536         return stab;
537 }
538
539 void qdisc_put_stab(struct qdisc_size_table *tab)
540 {
541         if (!tab)
542                 return;
543
544         if (--tab->refcnt == 0) {
545                 list_del(&tab->list);
546                 kfree_rcu(tab, rcu);
547         }
548 }
549 EXPORT_SYMBOL(qdisc_put_stab);
550
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
552 {
553         struct nlattr *nest;
554
555         nest = nla_nest_start_noflag(skb, TCA_STAB);
556         if (nest == NULL)
557                 goto nla_put_failure;
558         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559                 goto nla_put_failure;
560         nla_nest_end(skb, nest);
561
562         return skb->len;
563
564 nla_put_failure:
565         return -1;
566 }
567
568 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
569                                const struct qdisc_size_table *stab)
570 {
571         int pkt_len, slot;
572
573         pkt_len = skb->len + stab->szopts.overhead;
574         if (unlikely(!stab->szopts.tsize))
575                 goto out;
576
577         slot = pkt_len + stab->szopts.cell_align;
578         if (unlikely(slot < 0))
579                 slot = 0;
580
581         slot >>= stab->szopts.cell_log;
582         if (likely(slot < stab->szopts.tsize))
583                 pkt_len = stab->data[slot];
584         else
585                 pkt_len = stab->data[stab->szopts.tsize - 1] *
586                                 (slot / stab->szopts.tsize) +
587                                 stab->data[slot % stab->szopts.tsize];
588
589         pkt_len <<= stab->szopts.size_log;
590 out:
591         if (unlikely(pkt_len < 1))
592                 pkt_len = 1;
593         qdisc_skb_cb(skb)->pkt_len = pkt_len;
594 }
595 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
596
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601                         txt, qdisc->ops->id, qdisc->handle >> 16);
602                 qdisc->flags |= TCQ_F_WARN_NONWC;
603         }
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610                                                  timer);
611
612         rcu_read_lock();
613         __netif_schedule(qdisc_root(wd->qdisc));
614         rcu_read_unlock();
615
616         return HRTIMER_NORESTART;
617 }
618
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620                                  clockid_t clockid)
621 {
622         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623         wd->timer.function = qdisc_watchdog;
624         wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627
/* Set up watchdog @wd for @qdisc using CLOCK_MONOTONIC; see
 * qdisc_watchdog_init_clockid().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
633
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635                                       u64 delta_ns)
636 {
637         if (test_bit(__QDISC_STATE_DEACTIVATED,
638                      &qdisc_root_sleeping(wd->qdisc)->state))
639                 return;
640
641         if (hrtimer_is_queued(&wd->timer)) {
642                 u64 softexpires;
643
644                 softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer));
645                 /* If timer is already set in [expires, expires + delta_ns],
646                  * do not reprogram it.
647                  */
648                 if (softexpires - expires <= delta_ns)
649                         return;
650         }
651
652         hrtimer_start_range_ns(&wd->timer,
653                                ns_to_ktime(expires),
654                                delta_ns,
655                                HRTIMER_MODE_ABS_PINNED);
656 }
657 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
658
/* Cancel the hrtimer of watchdog @wd. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
664
665 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
666 {
667         struct hlist_head *h;
668         unsigned int i;
669
670         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
671
672         if (h != NULL) {
673                 for (i = 0; i < n; i++)
674                         INIT_HLIST_HEAD(&h[i]);
675         }
676         return h;
677 }
678
679 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
680 {
681         struct Qdisc_class_common *cl;
682         struct hlist_node *next;
683         struct hlist_head *nhash, *ohash;
684         unsigned int nsize, nmask, osize;
685         unsigned int i, h;
686
687         /* Rehash when load factor exceeds 0.75 */
688         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
689                 return;
690         nsize = clhash->hashsize * 2;
691         nmask = nsize - 1;
692         nhash = qdisc_class_hash_alloc(nsize);
693         if (nhash == NULL)
694                 return;
695
696         ohash = clhash->hash;
697         osize = clhash->hashsize;
698
699         sch_tree_lock(sch);
700         for (i = 0; i < osize; i++) {
701                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
702                         h = qdisc_class_hash(cl->classid, nmask);
703                         hlist_add_head(&cl->hnode, &nhash[h]);
704                 }
705         }
706         clhash->hash     = nhash;
707         clhash->hashsize = nsize;
708         clhash->hashmask = nmask;
709         sch_tree_unlock(sch);
710
711         kvfree(ohash);
712 }
713 EXPORT_SYMBOL(qdisc_class_hash_grow);
714
715 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
716 {
717         unsigned int size = 4;
718
719         clhash->hash = qdisc_class_hash_alloc(size);
720         if (!clhash->hash)
721                 return -ENOMEM;
722         clhash->hashsize  = size;
723         clhash->hashmask  = size - 1;
724         clhash->hashelems = 0;
725         return 0;
726 }
727 EXPORT_SYMBOL(qdisc_class_hash_init);
728
/* Free the bucket array of @clhash. Entries themselves are not touched. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
734
735 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
736                              struct Qdisc_class_common *cl)
737 {
738         unsigned int h;
739
740         INIT_HLIST_NODE(&cl->hnode);
741         h = qdisc_class_hash(cl->classid, clhash->hashmask);
742         hlist_add_head(&cl->hnode, &clhash->hash[h]);
743         clhash->hashelems++;
744 }
745 EXPORT_SYMBOL(qdisc_class_hash_insert);
746
747 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
748                              struct Qdisc_class_common *cl)
749 {
750         hlist_del(&cl->hnode);
751         clhash->hashelems--;
752 }
753 EXPORT_SYMBOL(qdisc_class_hash_remove);
754
755 /* Allocate an unique handle from space managed by kernel
756  * Possible range is [8000-FFFF]:0000 (0x8000 values)
757  */
758 static u32 qdisc_alloc_handle(struct net_device *dev)
759 {
760         int i = 0x8000;
761         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
762
763         do {
764                 autohandle += TC_H_MAKE(0x10000U, 0);
765                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
766                         autohandle = TC_H_MAKE(0x80000000U, 0);
767                 if (!qdisc_lookup(dev, autohandle))
768                         return autohandle;
769                 cond_resched();
770         } while (--i > 0);
771
772         return 0;
773 }
774
775 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
776 {
777         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
778         const struct Qdisc_class_ops *cops;
779         unsigned long cl;
780         u32 parentid;
781         bool notify;
782         int drops;
783
784         if (n == 0 && len == 0)
785                 return;
786         drops = max_t(int, n, 0);
787         rcu_read_lock();
788         while ((parentid = sch->parent)) {
789                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
790                         break;
791
792                 if (sch->flags & TCQ_F_NOPARENT)
793                         break;
794                 /* Notify parent qdisc only if child qdisc becomes empty.
795                  *
796                  * If child was empty even before update then backlog
797                  * counter is screwed and we skip notification because
798                  * parent class is already passive.
799                  *
800                  * If the original child was offloaded then it is allowed
801                  * to be seem as empty, so the parent is notified anyway.
802                  */
803                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
804                                                        !qdisc_is_offloaded);
805                 /* TODO: perform the search on a per txq basis */
806                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
807                 if (sch == NULL) {
808                         WARN_ON_ONCE(parentid != TC_H_ROOT);
809                         break;
810                 }
811                 cops = sch->ops->cl_ops;
812                 if (notify && cops->qlen_notify) {
813                         cl = cops->find(sch, parentid);
814                         cops->qlen_notify(sch, cl);
815                 }
816                 sch->q.qlen -= n;
817                 sch->qstats.backlog -= len;
818                 __qdisc_qstats_drop(sch, drops);
819         }
820         rcu_read_unlock();
821 }
822 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
823
824 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
825                               void *type_data)
826 {
827         struct net_device *dev = qdisc_dev(sch);
828         int err;
829
830         sch->flags &= ~TCQ_F_OFFLOADED;
831         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
832                 return 0;
833
834         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
835         if (err == -EOPNOTSUPP)
836                 return 0;
837
838         if (!err)
839                 sch->flags |= TCQ_F_OFFLOADED;
840
841         return err;
842 }
843 EXPORT_SYMBOL(qdisc_offload_dump_helper);
844
845 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
846                                 struct Qdisc *new, struct Qdisc *old,
847                                 enum tc_setup_type type, void *type_data,
848                                 struct netlink_ext_ack *extack)
849 {
850         bool any_qdisc_is_offloaded;
851         int err;
852
853         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
854                 return;
855
856         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
857
858         /* Don't report error if the graft is part of destroy operation. */
859         if (!err || !new || new == &noop_qdisc)
860                 return;
861
862         /* Don't report error if the parent, the old child and the new
863          * one are not offloaded.
864          */
865         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
866         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
867         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
868
869         if (any_qdisc_is_offloaded)
870                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
871 }
872 EXPORT_SYMBOL(qdisc_offload_graft_helper);
873
874 void qdisc_offload_query_caps(struct net_device *dev,
875                               enum tc_setup_type type,
876                               void *caps, size_t caps_len)
877 {
878         const struct net_device_ops *ops = dev->netdev_ops;
879         struct tc_query_caps_base base = {
880                 .type = type,
881                 .caps = caps,
882         };
883
884         memset(caps, 0, caps_len);
885
886         if (ops->ndo_setup_tc)
887                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
888 }
889 EXPORT_SYMBOL(qdisc_offload_query_caps);
890
891 static void qdisc_offload_graft_root(struct net_device *dev,
892                                      struct Qdisc *new, struct Qdisc *old,
893                                      struct netlink_ext_ack *extack)
894 {
895         struct tc_root_qopt_offload graft_offload = {
896                 .command        = TC_ROOT_GRAFT,
897                 .handle         = new ? new->handle : 0,
898                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
899                                   (old && old->flags & TCQ_F_INGRESS),
900         };
901
902         qdisc_offload_graft_helper(dev, NULL, new, old,
903                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
904 }
905
906 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
907                          u32 portid, u32 seq, u16 flags, int event,
908                          struct netlink_ext_ack *extack)
909 {
910         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
911         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
912         struct tcmsg *tcm;
913         struct nlmsghdr  *nlh;
914         unsigned char *b = skb_tail_pointer(skb);
915         struct gnet_dump d;
916         struct qdisc_size_table *stab;
917         u32 block_index;
918         __u32 qlen;
919
920         cond_resched();
921         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
922         if (!nlh)
923                 goto out_nlmsg_trim;
924         tcm = nlmsg_data(nlh);
925         tcm->tcm_family = AF_UNSPEC;
926         tcm->tcm__pad1 = 0;
927         tcm->tcm__pad2 = 0;
928         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
929         tcm->tcm_parent = clid;
930         tcm->tcm_handle = q->handle;
931         tcm->tcm_info = refcount_read(&q->refcnt);
932         if (nla_put_string(skb, TCA_KIND, q->ops->id))
933                 goto nla_put_failure;
934         if (q->ops->ingress_block_get) {
935                 block_index = q->ops->ingress_block_get(q);
936                 if (block_index &&
937                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
938                         goto nla_put_failure;
939         }
940         if (q->ops->egress_block_get) {
941                 block_index = q->ops->egress_block_get(q);
942                 if (block_index &&
943                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
944                         goto nla_put_failure;
945         }
946         if (q->ops->dump && q->ops->dump(q, skb) < 0)
947                 goto nla_put_failure;
948         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
949                 goto nla_put_failure;
950         qlen = qdisc_qlen_sum(q);
951
952         stab = rtnl_dereference(q->stab);
953         if (stab && qdisc_dump_stab(skb, stab) < 0)
954                 goto nla_put_failure;
955
956         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
957                                          NULL, &d, TCA_PAD) < 0)
958                 goto nla_put_failure;
959
960         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
961                 goto nla_put_failure;
962
963         if (qdisc_is_percpu_stats(q)) {
964                 cpu_bstats = q->cpu_bstats;
965                 cpu_qstats = q->cpu_qstats;
966         }
967
968         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
969             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
970             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
971                 goto nla_put_failure;
972
973         if (gnet_stats_finish_copy(&d) < 0)
974                 goto nla_put_failure;
975
976         if (extack && extack->_msg &&
977             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
978                 goto out_nlmsg_trim;
979
980         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
981
982         return skb->len;
983
984 out_nlmsg_trim:
985 nla_put_failure:
986         nlmsg_trim(skb, b);
987         return -1;
988 }
989
990 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
991 {
992         if (q->flags & TCQ_F_BUILTIN)
993                 return true;
994         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
995                 return true;
996
997         return false;
998 }
999
1000 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1001                         struct nlmsghdr *n, u32 clid,
1002                         struct Qdisc *old, struct Qdisc *new,
1003                         struct netlink_ext_ack *extack)
1004 {
1005         struct sk_buff *skb;
1006         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1007
1008         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1009         if (!skb)
1010                 return -ENOBUFS;
1011
1012         if (old && !tc_qdisc_dump_ignore(old, false)) {
1013                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1014                                   0, RTM_DELQDISC, extack) < 0)
1015                         goto err_out;
1016         }
1017         if (new && !tc_qdisc_dump_ignore(new, false)) {
1018                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1019                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
1020                         goto err_out;
1021         }
1022
1023         if (skb->len)
1024                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1025                                       n->nlmsg_flags & NLM_F_ECHO);
1026
1027 err_out:
1028         kfree_skb(skb);
1029         return -EINVAL;
1030 }
1031
1032 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1033                                struct nlmsghdr *n, u32 clid,
1034                                struct Qdisc *old, struct Qdisc *new,
1035                                struct netlink_ext_ack *extack)
1036 {
1037         if (new || old)
1038                 qdisc_notify(net, skb, n, clid, old, new, extack);
1039
1040         if (old)
1041                 qdisc_put(old);
1042 }
1043
1044 static void qdisc_clear_nolock(struct Qdisc *sch)
1045 {
1046         sch->flags &= ~TCQ_F_NOLOCK;
1047         if (!(sch->flags & TCQ_F_CPUSTATS))
1048                 return;
1049
1050         free_percpu(sch->cpu_bstats);
1051         free_percpu(sch->cpu_qstats);
1052         sch->cpu_bstats = NULL;
1053         sch->cpu_qstats = NULL;
1054         sch->flags &= ~TCQ_F_CPUSTATS;
1055 }
1056
1057 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1058  * to device "dev".
1059  *
1060  * When appropriate send a netlink notification using 'skb'
1061  * and "n".
1062  *
1063  * On success, destroy old qdisc.
1064  */
1065
1066 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1067                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1068                        struct Qdisc *new, struct Qdisc *old,
1069                        struct netlink_ext_ack *extack)
1070 {
1071         struct Qdisc *q = old;
1072         struct net *net = dev_net(dev);
1073
1074         if (parent == NULL) {
1075                 unsigned int i, num_q, ingress;
1076
1077                 ingress = 0;
1078                 num_q = dev->num_tx_queues;
1079                 if ((q && q->flags & TCQ_F_INGRESS) ||
1080                     (new && new->flags & TCQ_F_INGRESS)) {
1081                         num_q = 1;
1082                         ingress = 1;
1083                         if (!dev_ingress_queue(dev)) {
1084                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1085                                 return -ENOENT;
1086                         }
1087                 }
1088
1089                 if (dev->flags & IFF_UP)
1090                         dev_deactivate(dev);
1091
1092                 qdisc_offload_graft_root(dev, new, old, extack);
1093
1094                 if (new && new->ops->attach && !ingress)
1095                         goto skip;
1096
1097                 for (i = 0; i < num_q; i++) {
1098                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1099
1100                         if (!ingress)
1101                                 dev_queue = netdev_get_tx_queue(dev, i);
1102
1103                         old = dev_graft_qdisc(dev_queue, new);
1104                         if (new && i > 0)
1105                                 qdisc_refcount_inc(new);
1106
1107                         if (!ingress)
1108                                 qdisc_put(old);
1109                 }
1110
1111 skip:
1112                 if (!ingress) {
1113                         old = rtnl_dereference(dev->qdisc);
1114                         if (new && !new->ops->attach)
1115                                 qdisc_refcount_inc(new);
1116                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1117
1118                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1119
1120                         if (new && new->ops->attach)
1121                                 new->ops->attach(new);
1122                 } else {
1123                         notify_and_destroy(net, skb, n, classid, old, new, extack);
1124                 }
1125
1126                 if (dev->flags & IFF_UP)
1127                         dev_activate(dev);
1128         } else {
1129                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1130                 unsigned long cl;
1131                 int err;
1132
1133                 /* Only support running class lockless if parent is lockless */
1134                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1135                         qdisc_clear_nolock(new);
1136
1137                 if (!cops || !cops->graft)
1138                         return -EOPNOTSUPP;
1139
1140                 cl = cops->find(parent, classid);
1141                 if (!cl) {
1142                         NL_SET_ERR_MSG(extack, "Specified class not found");
1143                         return -ENOENT;
1144                 }
1145
1146                 if (new && new->ops == &noqueue_qdisc_ops) {
1147                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1148                         return -EINVAL;
1149                 }
1150
1151                 err = cops->graft(parent, cl, new, &old, extack);
1152                 if (err)
1153                         return err;
1154                 notify_and_destroy(net, skb, n, classid, old, new, extack);
1155         }
1156         return 0;
1157 }
1158
1159 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1160                                    struct netlink_ext_ack *extack)
1161 {
1162         u32 block_index;
1163
1164         if (tca[TCA_INGRESS_BLOCK]) {
1165                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1166
1167                 if (!block_index) {
1168                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1169                         return -EINVAL;
1170                 }
1171                 if (!sch->ops->ingress_block_set) {
1172                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1173                         return -EOPNOTSUPP;
1174                 }
1175                 sch->ops->ingress_block_set(sch, block_index);
1176         }
1177         if (tca[TCA_EGRESS_BLOCK]) {
1178                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1179
1180                 if (!block_index) {
1181                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1182                         return -EINVAL;
1183                 }
1184                 if (!sch->ops->egress_block_set) {
1185                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1186                         return -EOPNOTSUPP;
1187                 }
1188                 sch->ops->egress_block_set(sch, block_index);
1189         }
1190         return 0;
1191 }
1192
1193 /*
1194    Allocate and initialize new qdisc.
1195
1196    Parameters are passed via opt.
1197  */
1198
1199 static struct Qdisc *qdisc_create(struct net_device *dev,
1200                                   struct netdev_queue *dev_queue,
1201                                   u32 parent, u32 handle,
1202                                   struct nlattr **tca, int *errp,
1203                                   struct netlink_ext_ack *extack)
1204 {
1205         int err;
1206         struct nlattr *kind = tca[TCA_KIND];
1207         struct Qdisc *sch;
1208         struct Qdisc_ops *ops;
1209         struct qdisc_size_table *stab;
1210
1211         ops = qdisc_lookup_ops(kind);
1212 #ifdef CONFIG_MODULES
1213         if (ops == NULL && kind != NULL) {
1214                 char name[IFNAMSIZ];
1215                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1216                         /* We dropped the RTNL semaphore in order to
1217                          * perform the module load.  So, even if we
1218                          * succeeded in loading the module we have to
1219                          * tell the caller to replay the request.  We
1220                          * indicate this using -EAGAIN.
1221                          * We replay the request because the device may
1222                          * go away in the mean time.
1223                          */
1224                         rtnl_unlock();
1225                         request_module("sch_%s", name);
1226                         rtnl_lock();
1227                         ops = qdisc_lookup_ops(kind);
1228                         if (ops != NULL) {
1229                                 /* We will try again qdisc_lookup_ops,
1230                                  * so don't keep a reference.
1231                                  */
1232                                 module_put(ops->owner);
1233                                 err = -EAGAIN;
1234                                 goto err_out;
1235                         }
1236                 }
1237         }
1238 #endif
1239
1240         err = -ENOENT;
1241         if (!ops) {
1242                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1243                 goto err_out;
1244         }
1245
1246         sch = qdisc_alloc(dev_queue, ops, extack);
1247         if (IS_ERR(sch)) {
1248                 err = PTR_ERR(sch);
1249                 goto err_out2;
1250         }
1251
1252         sch->parent = parent;
1253
1254         if (handle == TC_H_INGRESS) {
1255                 sch->flags |= TCQ_F_INGRESS;
1256                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1257         } else {
1258                 if (handle == 0) {
1259                         handle = qdisc_alloc_handle(dev);
1260                         if (handle == 0) {
1261                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1262                                 err = -ENOSPC;
1263                                 goto err_out3;
1264                         }
1265                 }
1266                 if (!netif_is_multiqueue(dev))
1267                         sch->flags |= TCQ_F_ONETXQUEUE;
1268         }
1269
1270         sch->handle = handle;
1271
1272         /* This exist to keep backward compatible with a userspace
1273          * loophole, what allowed userspace to get IFF_NO_QUEUE
1274          * facility on older kernels by setting tx_queue_len=0 (prior
1275          * to qdisc init), and then forgot to reinit tx_queue_len
1276          * before again attaching a qdisc.
1277          */
1278         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1279                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1280                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1281         }
1282
1283         err = qdisc_block_indexes_set(sch, tca, extack);
1284         if (err)
1285                 goto err_out3;
1286
1287         if (tca[TCA_STAB]) {
1288                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1289                 if (IS_ERR(stab)) {
1290                         err = PTR_ERR(stab);
1291                         goto err_out3;
1292                 }
1293                 rcu_assign_pointer(sch->stab, stab);
1294         }
1295
1296         if (ops->init) {
1297                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1298                 if (err != 0)
1299                         goto err_out4;
1300         }
1301
1302         if (tca[TCA_RATE]) {
1303                 err = -EOPNOTSUPP;
1304                 if (sch->flags & TCQ_F_MQROOT) {
1305                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1306                         goto err_out4;
1307                 }
1308
1309                 err = gen_new_estimator(&sch->bstats,
1310                                         sch->cpu_bstats,
1311                                         &sch->rate_est,
1312                                         NULL,
1313                                         true,
1314                                         tca[TCA_RATE]);
1315                 if (err) {
1316                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1317                         goto err_out4;
1318                 }
1319         }
1320
1321         qdisc_hash_add(sch, false);
1322         trace_qdisc_create(ops, dev, parent);
1323
1324         return sch;
1325
1326 err_out4:
1327         /* Even if ops->init() failed, we call ops->destroy()
1328          * like qdisc_create_dflt().
1329          */
1330         if (ops->destroy)
1331                 ops->destroy(sch);
1332         qdisc_put_stab(rtnl_dereference(sch->stab));
1333 err_out3:
1334         netdev_put(dev, &sch->dev_tracker);
1335         qdisc_free(sch);
1336 err_out2:
1337         module_put(ops->owner);
1338 err_out:
1339         *errp = err;
1340         return NULL;
1341 }
1342
1343 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1344                         struct netlink_ext_ack *extack)
1345 {
1346         struct qdisc_size_table *ostab, *stab = NULL;
1347         int err = 0;
1348
1349         if (tca[TCA_OPTIONS]) {
1350                 if (!sch->ops->change) {
1351                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1352                         return -EINVAL;
1353                 }
1354                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1355                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1356                         return -EOPNOTSUPP;
1357                 }
1358                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1359                 if (err)
1360                         return err;
1361         }
1362
1363         if (tca[TCA_STAB]) {
1364                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1365                 if (IS_ERR(stab))
1366                         return PTR_ERR(stab);
1367         }
1368
1369         ostab = rtnl_dereference(sch->stab);
1370         rcu_assign_pointer(sch->stab, stab);
1371         qdisc_put_stab(ostab);
1372
1373         if (tca[TCA_RATE]) {
1374                 /* NB: ignores errors from replace_estimator
1375                    because change can't be undone. */
1376                 if (sch->flags & TCQ_F_MQROOT)
1377                         goto out;
1378                 gen_replace_estimator(&sch->bstats,
1379                                       sch->cpu_bstats,
1380                                       &sch->rate_est,
1381                                       NULL,
1382                                       true,
1383                                       tca[TCA_RATE]);
1384         }
1385 out:
1386         return 0;
1387 }
1388
1389 struct check_loop_arg {
1390         struct qdisc_walker     w;
1391         struct Qdisc            *p;
1392         int                     depth;
1393 };
1394
1395 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1396                          struct qdisc_walker *w);
1397
1398 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1399 {
1400         struct check_loop_arg   arg;
1401
1402         if (q->ops->cl_ops == NULL)
1403                 return 0;
1404
1405         arg.w.stop = arg.w.skip = arg.w.count = 0;
1406         arg.w.fn = check_loop_fn;
1407         arg.depth = depth;
1408         arg.p = p;
1409         q->ops->cl_ops->walk(q, &arg.w);
1410         return arg.w.stop ? -ELOOP : 0;
1411 }
1412
1413 static int
1414 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1415 {
1416         struct Qdisc *leaf;
1417         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1418         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1419
1420         leaf = cops->leaf(q, cl);
1421         if (leaf) {
1422                 if (leaf == arg->p || arg->depth > 7)
1423                         return -ELOOP;
1424                 return check_loop(leaf, arg->p, arg->depth + 1);
1425         }
1426         return 0;
1427 }
1428
1429 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1430         [TCA_KIND]              = { .type = NLA_STRING },
1431         [TCA_RATE]              = { .type = NLA_BINARY,
1432                                     .len = sizeof(struct tc_estimator) },
1433         [TCA_STAB]              = { .type = NLA_NESTED },
1434         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1435         [TCA_CHAIN]             = { .type = NLA_U32 },
1436         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1437         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1438 };
1439
1440 /*
1441  * Delete/get qdisc.
1442  */
1443
1444 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1445                         struct netlink_ext_ack *extack)
1446 {
1447         struct net *net = sock_net(skb->sk);
1448         struct tcmsg *tcm = nlmsg_data(n);
1449         struct nlattr *tca[TCA_MAX + 1];
1450         struct net_device *dev;
1451         u32 clid;
1452         struct Qdisc *q = NULL;
1453         struct Qdisc *p = NULL;
1454         int err;
1455
1456         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1457                                      rtm_tca_policy, extack);
1458         if (err < 0)
1459                 return err;
1460
1461         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1462         if (!dev)
1463                 return -ENODEV;
1464
1465         clid = tcm->tcm_parent;
1466         if (clid) {
1467                 if (clid != TC_H_ROOT) {
1468                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1469                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1470                                 if (!p) {
1471                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1472                                         return -ENOENT;
1473                                 }
1474                                 q = qdisc_leaf(p, clid);
1475                         } else if (dev_ingress_queue(dev)) {
1476                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1477                         }
1478                 } else {
1479                         q = rtnl_dereference(dev->qdisc);
1480                 }
1481                 if (!q) {
1482                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1483                         return -ENOENT;
1484                 }
1485
1486                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1487                         NL_SET_ERR_MSG(extack, "Invalid handle");
1488                         return -EINVAL;
1489                 }
1490         } else {
1491                 q = qdisc_lookup(dev, tcm->tcm_handle);
1492                 if (!q) {
1493                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1494                         return -ENOENT;
1495                 }
1496         }
1497
1498         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1499                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1500                 return -EINVAL;
1501         }
1502
1503         if (n->nlmsg_type == RTM_DELQDISC) {
1504                 if (!clid) {
1505                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1506                         return -EINVAL;
1507                 }
1508                 if (q->handle == 0) {
1509                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1510                         return -ENOENT;
1511                 }
1512                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1513                 if (err != 0)
1514                         return err;
1515         } else {
1516                 qdisc_notify(net, skb, n, clid, NULL, q, NULL);
1517         }
1518         return 0;
1519 }
1520
1521 /*
1522  * Create/change qdisc.
1523  */
1524
1525 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1526                            struct netlink_ext_ack *extack)
1527 {
1528         struct net *net = sock_net(skb->sk);
1529         struct tcmsg *tcm;
1530         struct nlattr *tca[TCA_MAX + 1];
1531         struct net_device *dev;
1532         u32 clid;
1533         struct Qdisc *q, *p;
1534         int err;
1535
1536 replay:
1537         /* Reinit, just in case something touches this. */
1538         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1539                                      rtm_tca_policy, extack);
1540         if (err < 0)
1541                 return err;
1542
1543         tcm = nlmsg_data(n);
1544         clid = tcm->tcm_parent;
1545         q = p = NULL;
1546
1547         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1548         if (!dev)
1549                 return -ENODEV;
1550
1551
1552         if (clid) {
1553                 if (clid != TC_H_ROOT) {
1554                         if (clid != TC_H_INGRESS) {
1555                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1556                                 if (!p) {
1557                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1558                                         return -ENOENT;
1559                                 }
1560                                 q = qdisc_leaf(p, clid);
1561                         } else if (dev_ingress_queue_create(dev)) {
1562                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1563                         }
1564                 } else {
1565                         q = rtnl_dereference(dev->qdisc);
1566                 }
1567
1568                 /* It may be default qdisc, ignore it */
1569                 if (q && q->handle == 0)
1570                         q = NULL;
1571
1572                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1573                         if (tcm->tcm_handle) {
1574                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1575                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1576                                         return -EEXIST;
1577                                 }
1578                                 if (TC_H_MIN(tcm->tcm_handle)) {
1579                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1580                                         return -EINVAL;
1581                                 }
1582                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1583                                 if (!q)
1584                                         goto create_n_graft;
1585                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1586                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1587                                         return -EEXIST;
1588                                 }
1589                                 if (tca[TCA_KIND] &&
1590                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1591                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1592                                         return -EINVAL;
1593                                 }
1594                                 if (q == p ||
1595                                     (p && check_loop(q, p, 0))) {
1596                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1597                                         return -ELOOP;
1598                                 }
1599                                 qdisc_refcount_inc(q);
1600                                 goto graft;
1601                         } else {
1602                                 if (!q)
1603                                         goto create_n_graft;
1604
1605                                 /* This magic test requires explanation.
1606                                  *
1607                                  *   We know, that some child q is already
1608                                  *   attached to this parent and have choice:
1609                                  *   either to change it or to create/graft new one.
1610                                  *
1611                                  *   1. We are allowed to create/graft only
1612                                  *   if CREATE and REPLACE flags are set.
1613                                  *
1614                                  *   2. If EXCL is set, requestor wanted to say,
1615                                  *   that qdisc tcm_handle is not expected
1616                                  *   to exist, so that we choose create/graft too.
1617                                  *
1618                                  *   3. The last case is when no flags are set.
1619                                  *   Alas, it is sort of hole in API, we
1620                                  *   cannot decide what to do unambiguously.
1621                                  *   For now we select create/graft, if
1622                                  *   user gave KIND, which does not match existing.
1623                                  */
1624                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1625                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1626                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1627                                      (tca[TCA_KIND] &&
1628                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1629                                         goto create_n_graft;
1630                         }
1631                 }
1632         } else {
1633                 if (!tcm->tcm_handle) {
1634                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1635                         return -EINVAL;
1636                 }
1637                 q = qdisc_lookup(dev, tcm->tcm_handle);
1638         }
1639
1640         /* Change qdisc parameters */
1641         if (!q) {
1642                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1643                 return -ENOENT;
1644         }
1645         if (n->nlmsg_flags & NLM_F_EXCL) {
1646                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1647                 return -EEXIST;
1648         }
1649         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1650                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1651                 return -EINVAL;
1652         }
1653         err = qdisc_change(q, tca, extack);
1654         if (err == 0)
1655                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1656         return err;
1657
1658 create_n_graft:
1659         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1660                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1661                 return -ENOENT;
1662         }
1663         if (clid == TC_H_INGRESS) {
1664                 if (dev_ingress_queue(dev)) {
1665                         q = qdisc_create(dev, dev_ingress_queue(dev),
1666                                          tcm->tcm_parent, tcm->tcm_parent,
1667                                          tca, &err, extack);
1668                 } else {
1669                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1670                         err = -ENOENT;
1671                 }
1672         } else {
1673                 struct netdev_queue *dev_queue;
1674
1675                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1676                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1677                 else if (p)
1678                         dev_queue = p->dev_queue;
1679                 else
1680                         dev_queue = netdev_get_tx_queue(dev, 0);
1681
1682                 q = qdisc_create(dev, dev_queue,
1683                                  tcm->tcm_parent, tcm->tcm_handle,
1684                                  tca, &err, extack);
1685         }
1686         if (q == NULL) {
1687                 if (err == -EAGAIN)
1688                         goto replay;
1689                 return err;
1690         }
1691
1692 graft:
1693         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1694         if (err) {
1695                 if (q)
1696                         qdisc_put(q);
1697                 return err;
1698         }
1699
1700         return 0;
1701 }
1702
1703 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1704                               struct netlink_callback *cb,
1705                               int *q_idx_p, int s_q_idx, bool recur,
1706                               bool dump_invisible)
1707 {
1708         int ret = 0, q_idx = *q_idx_p;
1709         struct Qdisc *q;
1710         int b;
1711
1712         if (!root)
1713                 return 0;
1714
1715         q = root;
1716         if (q_idx < s_q_idx) {
1717                 q_idx++;
1718         } else {
1719                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1720                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1721                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1722                                   RTM_NEWQDISC, NULL) <= 0)
1723                         goto done;
1724                 q_idx++;
1725         }
1726
1727         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1728          * itself has already been dumped.
1729          *
1730          * If we've already dumped the top-level (ingress) qdisc above and the global
1731          * qdisc hashtable, we don't want to hit it again
1732          */
1733         if (!qdisc_dev(root) || !recur)
1734                 goto out;
1735
1736         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1737                 if (q_idx < s_q_idx) {
1738                         q_idx++;
1739                         continue;
1740                 }
1741                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1742                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1743                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1744                                   RTM_NEWQDISC, NULL) <= 0)
1745                         goto done;
1746                 q_idx++;
1747         }
1748
1749 out:
1750         *q_idx_p = q_idx;
1751         return ret;
1752 done:
1753         ret = -1;
1754         goto out;
1755 }
1756
1757 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1758 {
1759         struct net *net = sock_net(skb->sk);
1760         int idx, q_idx;
1761         int s_idx, s_q_idx;
1762         struct net_device *dev;
1763         const struct nlmsghdr *nlh = cb->nlh;
1764         struct nlattr *tca[TCA_MAX + 1];
1765         int err;
1766
1767         s_idx = cb->args[0];
1768         s_q_idx = q_idx = cb->args[1];
1769
1770         idx = 0;
1771         ASSERT_RTNL();
1772
1773         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1774                                      rtm_tca_policy, cb->extack);
1775         if (err < 0)
1776                 return err;
1777
1778         for_each_netdev(net, dev) {
1779                 struct netdev_queue *dev_queue;
1780
1781                 if (idx < s_idx)
1782                         goto cont;
1783                 if (idx > s_idx)
1784                         s_q_idx = 0;
1785                 q_idx = 0;
1786
1787                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1788                                        skb, cb, &q_idx, s_q_idx,
1789                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1790                         goto done;
1791
1792                 dev_queue = dev_ingress_queue(dev);
1793                 if (dev_queue &&
1794                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1795                                        &q_idx, s_q_idx, false,
1796                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1797                         goto done;
1798
1799 cont:
1800                 idx++;
1801         }
1802
1803 done:
1804         cb->args[0] = idx;
1805         cb->args[1] = q_idx;
1806
1807         return skb->len;
1808 }
1809
1810
1811
1812 /************************************************
1813  *      Traffic classes manipulation.           *
1814  ************************************************/
1815
1816 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1817                           unsigned long cl, u32 portid, u32 seq, u16 flags,
1818                           int event, struct netlink_ext_ack *extack)
1819 {
1820         struct tcmsg *tcm;
1821         struct nlmsghdr  *nlh;
1822         unsigned char *b = skb_tail_pointer(skb);
1823         struct gnet_dump d;
1824         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1825
1826         cond_resched();
1827         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1828         if (!nlh)
1829                 goto out_nlmsg_trim;
1830         tcm = nlmsg_data(nlh);
1831         tcm->tcm_family = AF_UNSPEC;
1832         tcm->tcm__pad1 = 0;
1833         tcm->tcm__pad2 = 0;
1834         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1835         tcm->tcm_parent = q->handle;
1836         tcm->tcm_handle = q->handle;
1837         tcm->tcm_info = 0;
1838         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1839                 goto nla_put_failure;
1840         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1841                 goto nla_put_failure;
1842
1843         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1844                                          NULL, &d, TCA_PAD) < 0)
1845                 goto nla_put_failure;
1846
1847         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1848                 goto nla_put_failure;
1849
1850         if (gnet_stats_finish_copy(&d) < 0)
1851                 goto nla_put_failure;
1852
1853         if (extack && extack->_msg &&
1854             nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
1855                 goto out_nlmsg_trim;
1856
1857         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1858
1859         return skb->len;
1860
1861 out_nlmsg_trim:
1862 nla_put_failure:
1863         nlmsg_trim(skb, b);
1864         return -1;
1865 }
1866
1867 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1868                          struct nlmsghdr *n, struct Qdisc *q,
1869                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1870 {
1871         struct sk_buff *skb;
1872         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1873
1874         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1875         if (!skb)
1876                 return -ENOBUFS;
1877
1878         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1879                 kfree_skb(skb);
1880                 return -EINVAL;
1881         }
1882
1883         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1884                               n->nlmsg_flags & NLM_F_ECHO);
1885 }
1886
1887 static int tclass_del_notify(struct net *net,
1888                              const struct Qdisc_class_ops *cops,
1889                              struct sk_buff *oskb, struct nlmsghdr *n,
1890                              struct Qdisc *q, unsigned long cl,
1891                              struct netlink_ext_ack *extack)
1892 {
1893         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1894         struct sk_buff *skb;
1895         int err = 0;
1896
1897         if (!cops->delete)
1898                 return -EOPNOTSUPP;
1899
1900         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1901         if (!skb)
1902                 return -ENOBUFS;
1903
1904         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1905                            RTM_DELTCLASS, extack) < 0) {
1906                 kfree_skb(skb);
1907                 return -EINVAL;
1908         }
1909
1910         err = cops->delete(q, cl, extack);
1911         if (err) {
1912                 kfree_skb(skb);
1913                 return err;
1914         }
1915
1916         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1917                              n->nlmsg_flags & NLM_F_ECHO);
1918         return err;
1919 }
1920
1921 #ifdef CONFIG_NET_CLS
1922
1923 struct tcf_bind_args {
1924         struct tcf_walker w;
1925         unsigned long base;
1926         unsigned long cl;
1927         u32 classid;
1928 };
1929
1930 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1931 {
1932         struct tcf_bind_args *a = (void *)arg;
1933
1934         if (n && tp->ops->bind_class) {
1935                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1936
1937                 sch_tree_lock(q);
1938                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1939                 sch_tree_unlock(q);
1940         }
1941         return 0;
1942 }
1943
1944 struct tc_bind_class_args {
1945         struct qdisc_walker w;
1946         unsigned long new_cl;
1947         u32 portid;
1948         u32 clid;
1949 };
1950
1951 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1952                                 struct qdisc_walker *w)
1953 {
1954         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1955         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1956         struct tcf_block *block;
1957         struct tcf_chain *chain;
1958
1959         block = cops->tcf_block(q, cl, NULL);
1960         if (!block)
1961                 return 0;
1962         for (chain = tcf_get_next_chain(block, NULL);
1963              chain;
1964              chain = tcf_get_next_chain(block, chain)) {
1965                 struct tcf_proto *tp;
1966
1967                 for (tp = tcf_get_next_proto(chain, NULL);
1968                      tp; tp = tcf_get_next_proto(chain, tp)) {
1969                         struct tcf_bind_args arg = {};
1970
1971                         arg.w.fn = tcf_node_bind;
1972                         arg.classid = a->clid;
1973                         arg.base = cl;
1974                         arg.cl = a->new_cl;
1975                         tp->ops->walk(tp, &arg.w, true);
1976                 }
1977         }
1978
1979         return 0;
1980 }
1981
1982 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1983                            unsigned long new_cl)
1984 {
1985         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1986         struct tc_bind_class_args args = {};
1987
1988         if (!cops->tcf_block)
1989                 return;
1990         args.portid = portid;
1991         args.clid = clid;
1992         args.new_cl = new_cl;
1993         args.w.fn = tc_bind_class_walker;
1994         q->ops->cl_ops->walk(q, &args.w);
1995 }
1996
1997 #else
1998
1999 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2000                            unsigned long new_cl)
2001 {
2002 }
2003
2004 #endif
2005
2006 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2007                          struct netlink_ext_ack *extack)
2008 {
2009         struct net *net = sock_net(skb->sk);
2010         struct tcmsg *tcm = nlmsg_data(n);
2011         struct nlattr *tca[TCA_MAX + 1];
2012         struct net_device *dev;
2013         struct Qdisc *q = NULL;
2014         const struct Qdisc_class_ops *cops;
2015         unsigned long cl = 0;
2016         unsigned long new_cl;
2017         u32 portid;
2018         u32 clid;
2019         u32 qid;
2020         int err;
2021
2022         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2023                                      rtm_tca_policy, extack);
2024         if (err < 0)
2025                 return err;
2026
2027         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2028         if (!dev)
2029                 return -ENODEV;
2030
2031         /*
2032            parent == TC_H_UNSPEC - unspecified parent.
2033            parent == TC_H_ROOT   - class is root, which has no parent.
2034            parent == X:0         - parent is root class.
2035            parent == X:Y         - parent is a node in hierarchy.
2036            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2037
2038            handle == 0:0         - generate handle from kernel pool.
2039            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2040            handle == X:Y         - clear.
2041            handle == X:0         - root class.
2042          */
2043
2044         /* Step 1. Determine qdisc handle X:0 */
2045
2046         portid = tcm->tcm_parent;
2047         clid = tcm->tcm_handle;
2048         qid = TC_H_MAJ(clid);
2049
2050         if (portid != TC_H_ROOT) {
2051                 u32 qid1 = TC_H_MAJ(portid);
2052
2053                 if (qid && qid1) {
2054                         /* If both majors are known, they must be identical. */
2055                         if (qid != qid1)
2056                                 return -EINVAL;
2057                 } else if (qid1) {
2058                         qid = qid1;
2059                 } else if (qid == 0)
2060                         qid = rtnl_dereference(dev->qdisc)->handle;
2061
2062                 /* Now qid is genuine qdisc handle consistent
2063                  * both with parent and child.
2064                  *
2065                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2066                  */
2067                 if (portid)
2068                         portid = TC_H_MAKE(qid, portid);
2069         } else {
2070                 if (qid == 0)
2071                         qid = rtnl_dereference(dev->qdisc)->handle;
2072         }
2073
2074         /* OK. Locate qdisc */
2075         q = qdisc_lookup(dev, qid);
2076         if (!q)
2077                 return -ENOENT;
2078
2079         /* An check that it supports classes */
2080         cops = q->ops->cl_ops;
2081         if (cops == NULL)
2082                 return -EINVAL;
2083
2084         /* Now try to get class */
2085         if (clid == 0) {
2086                 if (portid == TC_H_ROOT)
2087                         clid = qid;
2088         } else
2089                 clid = TC_H_MAKE(qid, clid);
2090
2091         if (clid)
2092                 cl = cops->find(q, clid);
2093
2094         if (cl == 0) {
2095                 err = -ENOENT;
2096                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2097                     !(n->nlmsg_flags & NLM_F_CREATE))
2098                         goto out;
2099         } else {
2100                 switch (n->nlmsg_type) {
2101                 case RTM_NEWTCLASS:
2102                         err = -EEXIST;
2103                         if (n->nlmsg_flags & NLM_F_EXCL)
2104                                 goto out;
2105                         break;
2106                 case RTM_DELTCLASS:
2107                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2108                         /* Unbind the class with flilters with 0 */
2109                         tc_bind_tclass(q, portid, clid, 0);
2110                         goto out;
2111                 case RTM_GETTCLASS:
2112                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
2113                         goto out;
2114                 default:
2115                         err = -EINVAL;
2116                         goto out;
2117                 }
2118         }
2119
2120         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2121                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2122                 return -EOPNOTSUPP;
2123         }
2124
2125         new_cl = cl;
2126         err = -EOPNOTSUPP;
2127         if (cops->change)
2128                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2129         if (err == 0) {
2130                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
2131                 /* We just create a new class, need to do reverse binding. */
2132                 if (cl != new_cl)
2133                         tc_bind_tclass(q, portid, clid, new_cl);
2134         }
2135 out:
2136         return err;
2137 }
2138
2139 struct qdisc_dump_args {
2140         struct qdisc_walker     w;
2141         struct sk_buff          *skb;
2142         struct netlink_callback *cb;
2143 };
2144
2145 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2146                             struct qdisc_walker *arg)
2147 {
2148         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2149
2150         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2151                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2152                               RTM_NEWTCLASS, NULL);
2153 }
2154
2155 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2156                                 struct tcmsg *tcm, struct netlink_callback *cb,
2157                                 int *t_p, int s_t)
2158 {
2159         struct qdisc_dump_args arg;
2160
2161         if (tc_qdisc_dump_ignore(q, false) ||
2162             *t_p < s_t || !q->ops->cl_ops ||
2163             (tcm->tcm_parent &&
2164              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2165                 (*t_p)++;
2166                 return 0;
2167         }
2168         if (*t_p > s_t)
2169                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2170         arg.w.fn = qdisc_class_dump;
2171         arg.skb = skb;
2172         arg.cb = cb;
2173         arg.w.stop  = 0;
2174         arg.w.skip = cb->args[1];
2175         arg.w.count = 0;
2176         q->ops->cl_ops->walk(q, &arg.w);
2177         cb->args[1] = arg.w.count;
2178         if (arg.w.stop)
2179                 return -1;
2180         (*t_p)++;
2181         return 0;
2182 }
2183
2184 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2185                                struct tcmsg *tcm, struct netlink_callback *cb,
2186                                int *t_p, int s_t, bool recur)
2187 {
2188         struct Qdisc *q;
2189         int b;
2190
2191         if (!root)
2192                 return 0;
2193
2194         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2195                 return -1;
2196
2197         if (!qdisc_dev(root) || !recur)
2198                 return 0;
2199
2200         if (tcm->tcm_parent) {
2201                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2202                 if (q && q != root &&
2203                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2204                         return -1;
2205                 return 0;
2206         }
2207         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2208                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2209                         return -1;
2210         }
2211
2212         return 0;
2213 }
2214
2215 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2216 {
2217         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2218         struct net *net = sock_net(skb->sk);
2219         struct netdev_queue *dev_queue;
2220         struct net_device *dev;
2221         int t, s_t;
2222
2223         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2224                 return 0;
2225         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2226         if (!dev)
2227                 return 0;
2228
2229         s_t = cb->args[0];
2230         t = 0;
2231
2232         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2233                                 skb, tcm, cb, &t, s_t, true) < 0)
2234                 goto done;
2235
2236         dev_queue = dev_ingress_queue(dev);
2237         if (dev_queue &&
2238             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2239                                 &t, s_t, false) < 0)
2240                 goto done;
2241
2242 done:
2243         cb->args[0] = t;
2244
2245         dev_put(dev);
2246         return skb->len;
2247 }
2248
2249 #ifdef CONFIG_PROC_FS
2250 static int psched_show(struct seq_file *seq, void *v)
2251 {
2252         seq_printf(seq, "%08x %08x %08x %08x\n",
2253                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2254                    1000000,
2255                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2256
2257         return 0;
2258 }
2259
2260 static int __net_init psched_net_init(struct net *net)
2261 {
2262         struct proc_dir_entry *e;
2263
2264         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2265         if (e == NULL)
2266                 return -ENOMEM;
2267
2268         return 0;
2269 }
2270
2271 static void __net_exit psched_net_exit(struct net *net)
2272 {
2273         remove_proc_entry("psched", net->proc_net);
2274 }
2275 #else
2276 static int __net_init psched_net_init(struct net *net)
2277 {
2278         return 0;
2279 }
2280
2281 static void __net_exit psched_net_exit(struct net *net)
2282 {
2283 }
2284 #endif
2285
2286 static struct pernet_operations psched_net_ops = {
2287         .init = psched_net_init,
2288         .exit = psched_net_exit,
2289 };
2290
/*
 * Static key, initially false.
 * NOTE(review): presumably consumed by the helpers in <net/tc_wrapper.h>
 * and set up by tc_wrapper_init() - confirm against that header.
 */
DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper);
2292
2293 static int __init pktsched_init(void)
2294 {
2295         int err;
2296
2297         err = register_pernet_subsys(&psched_net_ops);
2298         if (err) {
2299                 pr_err("pktsched_init: "
2300                        "cannot initialize per netns operations\n");
2301                 return err;
2302         }
2303
2304         register_qdisc(&pfifo_fast_ops);
2305         register_qdisc(&pfifo_qdisc_ops);
2306         register_qdisc(&bfifo_qdisc_ops);
2307         register_qdisc(&pfifo_head_drop_qdisc_ops);
2308         register_qdisc(&mq_qdisc_ops);
2309         register_qdisc(&noqueue_qdisc_ops);
2310
2311         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2312         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2313         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2314                       0);
2315         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2316         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2317         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2318                       0);
2319
2320         tc_wrapper_init();
2321
2322         return 0;
2323 }
2324
2325 subsys_initcall(pktsched_init);